slaren committed

ggml-alloc : v3 (ggml/727)

* ggml-alloc v3
ggml-ci
* fix ci
ggml-ci
* whisper : check for backend buffer allocation failures
* whisper : avoid leaks when initialization fails
* cleanup
ggml-ci
* style fixes
ggml-ci
- ggml-alloc.c +563 -490
- ggml-alloc.h +39 -65
- ggml-backend.c +225 -258
- ggml-backend.h +5 -10
- ggml.c +19 -9
- ggml.h +15 -3
- whisper.cpp +175 -225
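
The headline change in this commit is the reworked ggml_gallocr interface shown in the ggml-alloc.c diff below: callers create a graph allocator from a buffer type, optionally reserve once from a worst-case graph, and then allocate each graph before evaluation. The following is a minimal sketch of that call pattern, not code from this commit; build_graph() is a placeholder for application code, and the backend handle is assumed to exist already.

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    extern struct ggml_cgraph * build_graph(void);   // placeholder for application code

    static void run_with_gallocr(ggml_backend_t backend) {
        // one allocator bound to the backend's default buffer type
        ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));

        // reserve from a worst-case graph so later calls do not need to grow the buffer
        ggml_gallocr_reserve(galloc, build_graph());

        // per iteration: assign tensor addresses inside the reserved buffer
        struct ggml_cgraph * graph = build_graph();
        if (!ggml_gallocr_alloc_graph(galloc, graph)) {
            return;   // allocation failed; handle the error
        }

        ggml_backend_graph_compute(backend, graph);

        ggml_gallocr_free(galloc);
    }

With a single buffer, ggml_gallocr_alloc_graph reserves (and reallocates) automatically when the graph no longer fits, so the explicit reserve call is mainly useful to avoid reallocations at runtime or when multiple buffers are used.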
ggml-alloc.c
CHANGED
@@ -17,6 +17,50 @@
 //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
 #define AT_PRINTF(...)

 // TODO: GGML_PAD ?
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
     assert(alignment && !(alignment & (alignment - 1))); // power of 2
@@ -24,66 +68,102 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
     return offset + align;
 }

 struct free_block {
-
     size_t size;
 };

- struct
-    struct ggml_backend_buffer * buffer;
-    bool buffer_owned;
-    void * base;
     size_t alignment;
-
     int n_free_blocks;
     struct free_block free_blocks[MAX_FREE_BLOCKS];
-
     size_t max_size;

-    bool measure;
-
 #ifdef GGML_ALLOCATOR_DEBUG
-    struct
 #endif
 };

 #ifdef GGML_ALLOCATOR_DEBUG
- static void add_allocated_tensor(
     for (int i = 0; i < 1024; i++) {
-        if (alloc->allocated_tensors[i] == NULL) {
-            alloc->allocated_tensors[i] = tensor;
             return;
         }
     }
     GGML_ASSERT(!"out of allocated_tensors");
 }
- static void remove_allocated_tensor(
     for (int i = 0; i < 1024; i++) {
-        if (alloc->allocated_tensors[i] ==
-
-            alloc->allocated_tensors[i] = NULL;
             return;
         }
     }
-
     GGML_ASSERT(!"tensor not found");
 }
 #endif

-
- static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
-    return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer);
- }
-
- static bool ggml_is_view(struct ggml_tensor * t) {
-    return t->view_src != NULL;
- }
-
- void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
-    GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
-    GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
-
-    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);

     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -109,16 +189,17 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         if (block->size >= size) {
             best_fit_block = alloc->n_free_blocks - 1;
         } else {
-
-
             GGML_ASSERT(!"not enough space in the buffer");
-
         }
     }

     struct free_block * block = &alloc->free_blocks[best_fit_block];
-
-    block->
     block->size -= size;
     if (block->size == 0) {
         // remove block if empty
@@ -128,59 +209,63 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         }
     }

-    AT_PRINTF("block %d,
-
-    tensor->data = addr;
-    tensor->buffer = alloc->buffer;
-    if (!alloc->measure) {
-        ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
-    }

 #ifdef GGML_ALLOCATOR_DEBUG
-    add_allocated_tensor(alloc, tensor);
-    size_t cur_max =
     if (cur_max > alloc->max_size) {
-
         for (int i = 0; i < 1024; i++) {
-            if (alloc->allocated_tensors[i]) {
-
             }
         }
-
     }
 #endif

-    alloc->max_size = MAX(alloc->max_size,
- }

-
- static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
-    if (ggml_tallocr_is_own(alloc, tensor) == false) {
-        // the tensor was not allocated in this buffer
-        // this can happen because the graph allocator will try to free weights and other tensors from different buffers
-        // the easiest way to deal with this is just to ignore it
-        // AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
-        return;
-    }

-

-
     size = aligned_offset(NULL, size, alloc->alignment);
-

 #ifdef GGML_ALLOCATOR_DEBUG
-    remove_allocated_tensor(alloc, tensor);
 #endif

     // see if we can merge with an existing block
     for (int i = 0; i < alloc->n_free_blocks; i++) {
         struct free_block * block = &alloc->free_blocks[i];
         // check if ptr is at the end of the block
-        if (
             block->size += size;
             // check if we can merge with the next block
-            if (i < alloc->n_free_blocks - 1 &&
                 block->size += alloc->free_blocks[i+1].size;
                 alloc->n_free_blocks--;
                 for (int j = i+1; j < alloc->n_free_blocks; j++) {
@@ -190,11 +275,11 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
             return;
         }
         // check if ptr is at the beginning of the block
-        if (
-            block->
             block->size += size;
             // check if we can merge with the previous block
-            if (i > 0 &&
                 alloc->free_blocks[i-1].size += block->size;
                 alloc->n_free_blocks--;
                 for (int j = i; j < alloc->n_free_blocks; j++) {
@@ -208,7 +293,7 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
     GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
     // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
     int insert_pos = 0;
-    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].
         insert_pos++;
     }
     // shift all blocks from insert_pos onward to make room for the new block
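
The hunks above rework the allocator's free-list bookkeeping; the replacement code later in this diff keeps the same best-fit-and-merge strategy but tracks offsets within a buffer instead of raw pointers. For readers unfamiliar with the scheme, here is a self-contained illustration with hypothetical names (block_alloc/block_free are not ggml functions, and it skips the empty-block removal and address-sorted insertion that the real code performs):

    #include <stddef.h>
    #include <assert.h>

    #define MAX_BLOCKS 256

    struct block { size_t offset, size; };
    static struct block blocks[MAX_BLOCKS];
    static int n_blocks;

    // best-fit: pick the smallest free block that can hold `size`
    static size_t block_alloc(size_t size) {
        int best = -1;
        for (int i = 0; i < n_blocks; i++) {
            if (blocks[i].size >= size && (best == -1 || blocks[i].size < blocks[best].size)) {
                best = i;
            }
        }
        assert(best != -1 && "out of memory");
        size_t offset = blocks[best].offset;
        blocks[best].offset += size;   // shrink the chosen block from the front
        blocks[best].size   -= size;
        return offset;
    }

    // free: merge the returned range with an adjacent block when possible
    static void block_free(size_t offset, size_t size) {
        for (int i = 0; i < n_blocks; i++) {
            if (blocks[i].offset + blocks[i].size == offset) { blocks[i].size += size; return; }
            if (offset + size == blocks[i].offset) { blocks[i].offset = offset; blocks[i].size += size; return; }
        }
        blocks[n_blocks++] = (struct block){ offset, size };   // otherwise record a new free block
    }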
@@ -216,337 +301,271 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         alloc->free_blocks[i] = alloc->free_blocks[i-1];
     }
     // insert the new block
-    alloc->free_blocks[insert_pos].
     alloc->free_blocks[insert_pos].size = size;
     alloc->n_free_blocks++;
 }

- void
     alloc->n_free_blocks = 1;
-
-    alloc->free_blocks[0].
-
-    if (alloc->measure) {
-        alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
-    } else {
-        alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
-        ggml_backend_buffer_reset(alloc->buffer);
-    }
 }

-
- struct
-
-    ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));

-    *alloc = (struct
-        /*.buffer = */ buffer,
-        /*.buffer_owned = */ true,
-        /*.base = */ ggml_backend_buffer_get_base(buffer),
         /*.alignment = */ alignment,
         /*.n_free_blocks = */ 0,
         /*.free_blocks = */ {{0}},
         /*.max_size = */ 0,
-        /*.measure = */ false,
 #ifdef GGML_ALLOCATOR_DEBUG
-        /*.allocated_tensors = */ {0},
 #endif
     };

-
-
-    return alloc;
- }
-
- ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
-    ggml_tallocr_t alloc = ggml_tallocr_new((void *)0x1000, SIZE_MAX/2, alignment);
-    alloc->measure = true;

     return alloc;
 }

-
-    // create a backend buffer to get the correct tensor allocation sizes
-    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);
-
-    // TODO: move alloc initialization to a common ggml_tallocr_new_impl function
-    ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
-    alloc->buffer_owned = true;
-    alloc->measure = true;
-    ggml_tallocr_reset(alloc);
-    return alloc;
- }
-
- ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
-    return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
- }
-
- ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
-    // create a backend buffer to get the correct tensor allocation sizes
-    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
-    ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
-    alloc->buffer_owned = true;
-    return alloc;
- }
-
- ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
-    return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
- }
-
- ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
-    ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
-
-    *alloc = (struct ggml_tallocr) {
-        /*.buffer = */ buffer,
-        /*.buffer_owned = */ false,
-        /*.base = */ ggml_backend_buffer_get_base(buffer),
-        /*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
-        /*.n_free_blocks = */ 0,
-        /*.free_blocks = */ {{0}},
-        /*.max_size = */ 0,
-        /*.measure = */ false,
- #ifdef GGML_ALLOCATOR_DEBUG
-        /*.allocated_tensors = */ {0},
- #endif
-    };
-
-    ggml_tallocr_reset(alloc);
-
-    return alloc;
- }
-
- struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) {
-    return alloc->buffer;
- }
-
- void ggml_tallocr_free(ggml_tallocr_t alloc) {
-    if (alloc == NULL) {
-        return;
-    }
-
-    if (alloc->buffer_owned) {
-        ggml_backend_buffer_free(alloc->buffer);
-    }
     free(alloc);
 }

-
-    return alloc->
 }

-
-
-    // to avoid this, we add a 10% margin to the buffer size
-    return alloc->max_size + alloc->max_size/10;
- }

 // graph allocator

 struct hash_node {
     int n_children;
     int n_views;
 };

 struct ggml_gallocr {
-
     struct ggml_hash_set hash_set;
-    struct hash_node * hash_values;
-
-
-    int
-    int parse_seq_len;
 };

- ggml_gallocr_t
-    ggml_gallocr_t galloc = (ggml_gallocr_t)
-
-
-
-
-
-
-
-
-

     return galloc;
 }

 void ggml_gallocr_free(ggml_gallocr_t galloc) {
     if (galloc == NULL) {
         return;
     }

-
-
-
-
-
-
-
-        free(galloc->hash_allocs);
-    }
-    if (galloc->parse_seq != NULL) {
-        free(galloc->parse_seq);
     }
     free(galloc);
 }

-
-    free(galloc->parse_seq);
-    galloc->parse_seq = malloc(sizeof(int) * n);

-
-        galloc->parse_seq[i] = list[i];
-    }
-    galloc->parse_seq_len = n;
- }
-
- static struct hash_node * hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
     size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
     return &galloc->hash_values[i];
 }

- static bool
-
-        return false;
-    }
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        if (a->ne[i] != b->ne[i]) {
-            return false;
-        }
-        if (a->nb[i] != b->nb[i]) {
-            return false;
-        }
-    }
-    return true;
 }

- static
-
-
-
-
-        case GGML_OP_ADD:
-        case GGML_OP_ADD1:
-        case GGML_OP_SUB:
-        case GGML_OP_MUL:
-        case GGML_OP_DIV:
-        case GGML_OP_SQR:
-        case GGML_OP_SQRT:
-        case GGML_OP_LOG:
-        case GGML_OP_UNARY:
-        case GGML_OP_ROPE:
-        case GGML_OP_RMS_NORM:
-        case GGML_OP_SOFT_MAX:
-            return true;
-
-        default:
-            return false;
-    }
 }

- static
-
-        return galloc->talloc;
-    }
-
-    return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
 }

- static void
-
-
-    GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
-    if (update_backend) {
-        view->backend = view->view_src->backend;
-    }
-    // views are initialized in the alloc buffer rather than the view_src buffer
-    view->buffer = alloc->buffer;
-    view->data = (char *)view->view_src->data + view->view_offs;

-

-
-
-
-

-
-

-
-
-
-
-
-        if (ggml_op_can_inplace(node->op)) {
-            for (int i = 0; i < GGML_MAX_SRC; i++) {
-                struct ggml_tensor * parent = node->src[i];
-                if (parent == NULL) {
-                    break;
-                }

-
-
-
-
-                }

-
-
-
-
-
-
-
-
-
-
-
-
-                        node->view_src = view_src;
-                        view_src_hn->n_views += 1;
-                        init_view(galloc, node, false);
-                        return;
-                    }
-                } else {
-                    AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
-                    node->view_src = parent;
-                    p_hn->n_views += 1;
-                    init_view(galloc, node, false);
                     return;
                 }
             }
         }
     }
-    ggml_tallocr_alloc(alloc, node);
     }
 }
 }

- static void
-

-
 }

- static
-
-

     // count number of children and views
-    for (int i = 0; i <
-        struct ggml_tensor * node =

         if (ggml_is_view(node)) {
             struct ggml_tensor * view_src = node->view_src;
-
-            if (node->buffer == NULL && node->data != NULL) {
-                // view of a pre-allocated tensor, didn't call init_view() yet
-                init_view(galloc, node, true);
-            }
         }

         for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -554,227 +573,283 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
             if (parent == NULL) {
                 break;
             }
-
-            if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
-                init_view(galloc, parent, true);
-            }
         }
     }

     // allocate tensors
-
-
-
-
-
-
-
-
-
-        // allocate parents (leafs)
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * parent = node->src[j];
-            if (parent == NULL) {
-                break;
-            }
-            allocate_node(galloc, parent);
         }

-
-

-
-
-
-
-
-
-
-
-
-        }
     }
-    AT_PRINTF("\n");
 }

     // update parents
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            struct hash_node *
-
-
-
-
-
-                if (ggml_is_view(parent)) {
-                    struct ggml_tensor * view_src = parent->view_src;
-                    struct hash_node * view_src_hn = hash_get(galloc, view_src);
-                    view_src_hn->n_views -= 1;
-                    AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
-                    if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0) {
-                        free_node(galloc, view_src);
-                    }
-                }
-                else {
-                    free_node(galloc, parent);
-                }
             }
         }
     }
     AT_PRINTF("\n");
-    if (parse_seq_len) {
-        last_barrier_pos = ind + 1;
-    }
 }
 }
 }

-
     size_t hash_size = graph->visited_hash_table.size;

-    //
     if (galloc->hash_set.size < hash_size) {
-
-
-        }
-        if (galloc->hash_values != NULL) {
-            free(galloc->hash_values);
-        }
-        galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size);
         galloc->hash_set.size = hash_size;
-        galloc->
     }

-    // reset
-
-
-
-    galloc->talloc = talloc;
-    ggml_tallocr_alloc_graph_impl(galloc, graph);
-    galloc->talloc = NULL;
-
-    size_t max_size = ggml_tallocr_max_size(talloc);
-
-    return max_size;
- }
-
- void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
-    const size_t hash_size = hash_set.size;
-
-    GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));

-

-    //
-    if (galloc->
-        free(galloc->
-        galloc->
-        galloc->
     }
-
-
-
-
 }
-    galloc->hash_set = hash_set;

-    //
-

-
-
-

-
-    galloc->hash_set.keys = NULL;
-    galloc->hash_allocs = NULL;
 }

-
-
- struct ggml_allocr {
-    ggml_tallocr_t talloc;
-    ggml_gallocr_t galloc;
- };
-
- static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) {
-    ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr));
-    *alloc = (struct ggml_allocr) {
-        /*.talloc = */ talloc,
-        /*.galloc = */ ggml_gallocr_new(),
-    };
-    return alloc;
 }

-
-
- }

-
-
-

-
-
 }

-
-
 }

-
-
-

-
-
-

-
-
-

-
-
-
 }

-
-    ggml_tallocr_free(alloc->talloc);
-    free(alloc);
 }

- bool
-
-

-
-
-

-
-
-

-
-    return ggml_tallocr_max_size(alloc->talloc);
 }

- size_t
-
 }

 // utils
@@ -795,17 +870,17 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         return false;
     }

-
     for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
         if (t->data == NULL) {
             if (t->view_src == NULL) {
                 ggml_tallocr_alloc(tallocr, t);
-            } else {
                 ggml_backend_view_init(buffer, t);
             }
         } else {
-            if (t->view_src != NULL) {
                 // view of a pre-allocated tensor
                 ggml_backend_view_init(buffer, t);
             }
@@ -838,7 +913,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
     }

     if (this_size > max_size) {
-        // tensor is too large to fit in a single buffer
         fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
                 __func__, t->name,
                 ggml_backend_buft_name(buft),
@@ -870,7 +944,6 @@
     }

     if (n_buffers == 0) {
-        // all the tensors in the context are already allocated
 #ifndef NDEBUG
         fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
 #endif
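
The last three hunks touch the helper that places every tensor of a ggml_context into backend buffers. A minimal sketch of how it is typically called from user code, assuming a CPU buffer type and a no_alloc context (the tensor shapes are arbitrary and only for illustration):

    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    static ggml_backend_buffer_t alloc_model_tensors(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead() * 16,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,   // only create tensor metadata here
        };
        struct ggml_context * ctx = ggml_init(params);

        // metadata only; the data pointers are assigned by ggml-alloc below
        ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1024, 1024);
        ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);

        // allocates one or more backend buffers and assigns every tensor's data pointer
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
        if (buf == NULL) {
            fprintf(stderr, "failed to allocate buffer\n");
        }
        return buf;
    }

The commit message's whisper.cpp changes ("check for backend buffer allocation failures") follow the same pattern of checking the returned buffer before use.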
| 17 |
//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
|
| 18 |
#define AT_PRINTF(...)
|
| 19 |
|
| 20 |
+
|
| 21 |
+
static bool ggml_is_view(const struct ggml_tensor * t) {
|
| 22 |
+
return t->view_src != NULL;
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
| 26 |
+
if (a->type != b->type) {
|
| 27 |
+
return false;
|
| 28 |
+
}
|
| 29 |
+
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
| 30 |
+
if (a->ne[i] != b->ne[i]) {
|
| 31 |
+
return false;
|
| 32 |
+
}
|
| 33 |
+
if (a->nb[i] != b->nb[i]) {
|
| 34 |
+
return false;
|
| 35 |
+
}
|
| 36 |
+
}
|
| 37 |
+
return true;
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
static bool ggml_op_can_inplace(enum ggml_op op) {
|
| 41 |
+
switch (op) {
|
| 42 |
+
case GGML_OP_SCALE:
|
| 43 |
+
case GGML_OP_DIAG_MASK_ZERO:
|
| 44 |
+
case GGML_OP_DIAG_MASK_INF:
|
| 45 |
+
case GGML_OP_ADD:
|
| 46 |
+
case GGML_OP_ADD1:
|
| 47 |
+
case GGML_OP_SUB:
|
| 48 |
+
case GGML_OP_MUL:
|
| 49 |
+
case GGML_OP_DIV:
|
| 50 |
+
case GGML_OP_SQR:
|
| 51 |
+
case GGML_OP_SQRT:
|
| 52 |
+
case GGML_OP_LOG:
|
| 53 |
+
case GGML_OP_UNARY:
|
| 54 |
+
case GGML_OP_ROPE:
|
| 55 |
+
case GGML_OP_RMS_NORM:
|
| 56 |
+
case GGML_OP_SOFT_MAX:
|
| 57 |
+
return true;
|
| 58 |
+
|
| 59 |
+
default:
|
| 60 |
+
return false;
|
| 61 |
+
}
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
// TODO: GGML_PAD ?
|
| 65 |
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
|
| 66 |
assert(alignment && !(alignment & (alignment - 1))); // power of 2
|
|
|
|
| 68 |
return offset + align;
|
| 69 |
}
|
| 70 |
|
| 71 |
+
// tallocr
|
| 72 |
+
struct ggml_tallocr {
|
| 73 |
+
ggml_backend_buffer_t buffer;
|
| 74 |
+
void * base;
|
| 75 |
+
size_t alignment;
|
| 76 |
+
size_t offset;
|
| 77 |
+
};
|
| 78 |
+
|
| 79 |
+
ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
|
| 80 |
+
ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
|
| 81 |
+
if (talloc == NULL) {
|
| 82 |
+
return NULL;
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
void * base = ggml_backend_buffer_get_base(buffer);
|
| 86 |
+
size_t align = ggml_backend_buffer_get_alignment(buffer);
|
| 87 |
+
|
| 88 |
+
assert(align && !(align & (align - 1))); // power of 2
|
| 89 |
+
|
| 90 |
+
*talloc = (struct ggml_tallocr) {
|
| 91 |
+
/*.buffer = */ buffer,
|
| 92 |
+
/*.base = */ base,
|
| 93 |
+
/*.alignment = */ align,
|
| 94 |
+
/*.offset = */ aligned_offset(base, 0, align),
|
| 95 |
+
};
|
| 96 |
+
return talloc;
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
void ggml_tallocr_free(ggml_tallocr_t talloc) {
|
| 100 |
+
free(talloc);
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
|
| 104 |
+
size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
|
| 105 |
+
size = GGML_PAD(size, talloc->alignment);
|
| 106 |
+
|
| 107 |
+
if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
|
| 108 |
+
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
|
| 109 |
+
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
|
| 110 |
+
GGML_ASSERT(!"not enough space in the buffer");
|
| 111 |
+
return;
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
|
| 115 |
+
talloc->offset += size;
|
| 116 |
+
|
| 117 |
+
assert(((uintptr_t)addr % talloc->alignment) == 0);
|
| 118 |
+
|
| 119 |
+
ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
|
| 120 |
+
}
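With v3, ggml_tallocr is reduced to a plain bump allocator over an already-created backend buffer: it takes the alignment from the buffer, advances an internal offset on every allocation, asserts when the buffer runs out, and does not own the buffer. A minimal usage sketch (not from this commit; the buffer type, size, and tensors are assumed to come from the caller):

    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    // place two pre-created tensors into one buffer, back to back (aligned)
    static ggml_backend_buffer_t place_tensors(ggml_backend_buffer_type_t buft, size_t buf_size,
                                               struct ggml_tensor * a, struct ggml_tensor * b) {
        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
        ggml_tallocr_t talloc = ggml_tallocr_new(buf);

        ggml_tallocr_alloc(talloc, a);   // each call advances the internal offset
        ggml_tallocr_alloc(talloc, b);   // asserts if the buffer runs out of space

        ggml_tallocr_free(talloc);       // frees only the allocator, not the buffer
        return buf;
    }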
|
| 121 |
+
|
| 122 |
+
// dynamic tensor allocator
|
| 123 |
+
|
| 124 |
struct free_block {
|
| 125 |
+
size_t offset;
|
| 126 |
size_t size;
|
| 127 |
};
|
| 128 |
|
| 129 |
+
struct ggml_dyn_tallocr {
|
|
|
|
|
|
|
|
|
|
| 130 |
size_t alignment;
|
|
|
|
| 131 |
int n_free_blocks;
|
| 132 |
struct free_block free_blocks[MAX_FREE_BLOCKS];
|
|
|
|
| 133 |
size_t max_size;
|
| 134 |
|
|
|
|
|
|
|
| 135 |
#ifdef GGML_ALLOCATOR_DEBUG
|
| 136 |
+
struct {
|
| 137 |
+
const struct ggml_tensor * tensor;
|
| 138 |
+
size_t offset;
|
| 139 |
+
} allocated_tensors[1024];
|
| 140 |
#endif
|
| 141 |
};
|
| 142 |
|
| 143 |
#ifdef GGML_ALLOCATOR_DEBUG
|
| 144 |
+
static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
|
| 145 |
for (int i = 0; i < 1024; i++) {
|
| 146 |
+
if (alloc->allocated_tensors[i].tensor == NULL) {
|
| 147 |
+
alloc->allocated_tensors[i].tensor = tensor;
|
| 148 |
+
alloc->allocated_tensors[i].offset = offset;
|
| 149 |
return;
|
| 150 |
}
|
| 151 |
}
|
| 152 |
GGML_ASSERT(!"out of allocated_tensors");
|
| 153 |
}
|
| 154 |
+
static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
|
| 155 |
for (int i = 0; i < 1024; i++) {
|
| 156 |
+
if (alloc->allocated_tensors[i].offset == offset) {
|
| 157 |
+
alloc->allocated_tensors[i].tensor = NULL;
|
|
|
|
| 158 |
return;
|
| 159 |
}
|
| 160 |
}
|
| 161 |
+
fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
|
| 162 |
GGML_ASSERT(!"tensor not found");
|
| 163 |
}
|
| 164 |
#endif
|
| 165 |
|
| 166 |
+
static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
size = aligned_offset(NULL, size, alloc->alignment);
|
| 168 |
|
| 169 |
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
|
|
|
|
| 189 |
if (block->size >= size) {
|
| 190 |
best_fit_block = alloc->n_free_blocks - 1;
|
| 191 |
} else {
|
| 192 |
+
// this should never happen
|
| 193 |
+
fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
|
| 194 |
+
__func__, size, max_avail);
|
| 195 |
GGML_ASSERT(!"not enough space in the buffer");
|
| 196 |
+
GGML_UNREACHABLE();
|
| 197 |
}
|
| 198 |
}
|
| 199 |
|
| 200 |
struct free_block * block = &alloc->free_blocks[best_fit_block];
|
| 201 |
+
size_t offset = block->offset;
|
| 202 |
+
block->offset = offset + size;
|
| 203 |
block->size -= size;
|
| 204 |
if (block->size == 0) {
|
| 205 |
// remove block if empty
|
|
|
|
| 209 |
}
|
| 210 |
}
|
| 211 |
|
| 212 |
+
AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
|
| 214 |
#ifdef GGML_ALLOCATOR_DEBUG
|
| 215 |
+
add_allocated_tensor(alloc, offset, tensor);
|
| 216 |
+
size_t cur_max = offset + size;
|
| 217 |
if (cur_max > alloc->max_size) {
|
| 218 |
+
// sort allocated_tensors by offset
|
| 219 |
+
for (int i = 0; i < 1024; i++) {
|
| 220 |
+
for (int j = i + 1; j < 1024; j++) {
|
| 221 |
+
if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
|
| 222 |
+
const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
|
| 223 |
+
size_t tmp_offset = alloc->allocated_tensors[i].offset;
|
| 224 |
+
alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
|
| 225 |
+
alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
|
| 226 |
+
alloc->allocated_tensors[j].tensor = tmp_tensor;
|
| 227 |
+
alloc->allocated_tensors[j].offset = tmp_offset;
|
| 228 |
+
}
|
| 229 |
+
}
|
| 230 |
+
}
|
| 231 |
+
fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
|
| 232 |
for (int i = 0; i < 1024; i++) {
|
| 233 |
+
if (alloc->allocated_tensors[i].tensor) {
|
| 234 |
+
fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
|
| 235 |
+
alloc->allocated_tensors[i].offset,
|
| 236 |
+
alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
|
| 237 |
+
ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
|
| 238 |
}
|
| 239 |
}
|
| 240 |
+
fprintf(stderr, "\n");
|
| 241 |
}
|
| 242 |
#endif
|
| 243 |
|
| 244 |
+
alloc->max_size = MAX(alloc->max_size, offset + size);
|
|
|
|
| 245 |
|
| 246 |
+
return offset;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
|
| 248 |
+
GGML_UNUSED(tensor);
|
| 249 |
+
}
|
| 250 |
|
| 251 |
+
// this is a very naive implementation, but for our case the number of free blocks should be very small
|
| 252 |
+
static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
|
| 253 |
size = aligned_offset(NULL, size, alloc->alignment);
|
| 254 |
+
|
| 255 |
+
AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
|
| 256 |
|
| 257 |
#ifdef GGML_ALLOCATOR_DEBUG
|
| 258 |
+
remove_allocated_tensor(alloc, offset, tensor);
|
| 259 |
#endif
|
| 260 |
|
| 261 |
// see if we can merge with an existing block
|
| 262 |
for (int i = 0; i < alloc->n_free_blocks; i++) {
|
| 263 |
struct free_block * block = &alloc->free_blocks[i];
|
| 264 |
// check if ptr is at the end of the block
|
| 265 |
+
if (block->offset + block->size == offset) {
|
| 266 |
block->size += size;
|
| 267 |
// check if we can merge with the next block
|
| 268 |
+
if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
|
| 269 |
block->size += alloc->free_blocks[i+1].size;
|
| 270 |
alloc->n_free_blocks--;
|
| 271 |
for (int j = i+1; j < alloc->n_free_blocks; j++) {
|
|
|
|
| 275 |
return;
|
| 276 |
}
|
| 277 |
// check if ptr is at the beginning of the block
|
| 278 |
+
if (offset + size == block->offset) {
|
| 279 |
+
block->offset = offset;
|
| 280 |
block->size += size;
|
| 281 |
// check if we can merge with the previous block
|
| 282 |
+
if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
|
| 283 |
alloc->free_blocks[i-1].size += block->size;
|
| 284 |
alloc->n_free_blocks--;
|
| 285 |
for (int j = i; j < alloc->n_free_blocks; j++) {
|
|
|
|
| 293 |
GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
|
| 294 |
// insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
|
| 295 |
int insert_pos = 0;
|
| 296 |
+
while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
|
| 297 |
insert_pos++;
|
| 298 |
}
|
| 299 |
// shift all blocks from insert_pos onward to make room for the new block
|
|
|
|
| 301 |
alloc->free_blocks[i] = alloc->free_blocks[i-1];
|
| 302 |
}
|
| 303 |
// insert the new block
|
| 304 |
+
alloc->free_blocks[insert_pos].offset = offset;
|
| 305 |
alloc->free_blocks[insert_pos].size = size;
|
| 306 |
alloc->n_free_blocks++;
|
| 307 |
+
|
| 308 |
+
GGML_UNUSED(tensor);
|
| 309 |
}
|
| 310 |
|
| 311 |
+
static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
|
| 312 |
alloc->n_free_blocks = 1;
|
| 313 |
+
alloc->free_blocks[0].offset = 0;
|
| 314 |
+
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
|
| 315 |
+
alloc->max_size = 0;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
}
|
| 317 |
|
| 318 |
+
static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
|
| 319 |
+
struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
|
|
|
|
|
|
|
| 320 |
|
| 321 |
+
*alloc = (struct ggml_dyn_tallocr) {
|
|
|
|
|
|
|
|
|
|
| 322 |
/*.alignment = */ alignment,
|
| 323 |
/*.n_free_blocks = */ 0,
|
| 324 |
/*.free_blocks = */ {{0}},
|
| 325 |
/*.max_size = */ 0,
|
|
|
|
| 326 |
#ifdef GGML_ALLOCATOR_DEBUG
|
| 327 |
+
/*.allocated_tensors = */ {{0}},
|
| 328 |
#endif
|
| 329 |
};
|
| 330 |
|
| 331 |
+
ggml_dyn_tallocr_reset(alloc);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
|
| 333 |
return alloc;
|
| 334 |
}
|
| 335 |
|
| 336 |
+
static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
|
|
|
|
|
|

|
|
|
| 337 |
free(alloc);
|
| 338 |
}
|
| 339 |
|
| 340 |
+
static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
|
| 341 |
+
return alloc->max_size;
|
| 342 |
}
|
| 343 |
|
| 344 |
+
|
| 345 |
+
/////////////////////////////////////
|
|
|
|
|
|
|
|
|
|
| 346 |
|
| 347 |
// graph allocator
|
| 348 |
|
| 349 |
struct hash_node {
|
| 350 |
int n_children;
|
| 351 |
int n_views;
|
| 352 |
+
int buffer_id;
|
| 353 |
+
size_t offset; // offset within the buffer
|
| 354 |
+
bool allocated;
|
| 355 |
+
};
|
| 356 |
+
|
| 357 |
+
//
|
| 358 |
+
struct tensor_alloc {
|
| 359 |
+
size_t offset;
|
| 360 |
+
size_t size_max; // 0 = pre-allocated, unused, or view
|
| 361 |
+
};
|
| 362 |
+
|
| 363 |
+
struct node_alloc {
|
| 364 |
+
int buffer_id;
|
| 365 |
+
struct tensor_alloc dst;
|
| 366 |
+
struct tensor_alloc src[GGML_MAX_SRC];
|
| 367 |
};
|
| 368 |
|
| 369 |
struct ggml_gallocr {
|
| 370 |
+
ggml_backend_buffer_type_t * bufts; // [n_buffers]
|
| 371 |
+
ggml_backend_buffer_t * buffers; // [n_buffers]
|
| 372 |
+
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
|
| 373 |
+
int n_buffers;
|
| 374 |
+
|
| 375 |
struct ggml_hash_set hash_set;
|
| 376 |
+
struct hash_node * hash_values; // [hash_set.size]
|
| 377 |
+
|
| 378 |
+
struct node_alloc * node_allocs; // [n_nodes]
|
| 379 |
+
int n_nodes;
|
|
|
|
| 380 |
};
|
| 381 |
|
| 382 |
+
ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
|
| 383 |
+
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
|
| 384 |
+
GGML_ASSERT(galloc != NULL);
|
| 385 |
+
|
| 386 |
+
galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
|
| 387 |
+
GGML_ASSERT(galloc->bufts != NULL);
|
| 388 |
+
|
| 389 |
+
galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
|
| 390 |
+
GGML_ASSERT(galloc->buffers != NULL);
|
| 391 |
+
|
| 392 |
+
galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
|
| 393 |
+
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
| 394 |
+
|
| 395 |
+
for (int i = 0; i < n_bufs; i++) {
|
| 396 |
+
galloc->bufts[i] = bufts[i];
|
| 397 |
+
galloc->buffers[i] = NULL;
|
| 398 |
+
size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
|
| 399 |
+
galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
|
| 400 |
+
}
|
| 401 |
+
galloc->n_buffers = n_bufs;
|
| 402 |
|
| 403 |
return galloc;
|
| 404 |
}
|
| 405 |
|
| 406 |
+
ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
|
| 407 |
+
return ggml_gallocr_new_n(&buft, 1);
|
| 408 |
+
}
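ggml_gallocr_new_n is the multi-buffer variant: each graph node can be assigned to one of several buffer types through the node_buffer_ids array passed to ggml_gallocr_reserve_n. A hedged sketch of that flow (the two buffer types and the id array are stand-ins chosen by the caller, not something this commit prescribes):

    #include <stdbool.h>
    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    // split a graph across two buffer types; node_buffer_ids[i] is 0 or 1 per node
    static bool reserve_split(struct ggml_cgraph * graph,
                              ggml_backend_buffer_type_t buft0,
                              ggml_backend_buffer_type_t buft1,
                              const int * node_buffer_ids) {
        ggml_backend_buffer_type_t bufts[2] = { buft0, buft1 };
        ggml_gallocr_t galloc = ggml_gallocr_new_n(bufts, 2);

        // plans per-node offsets and (re)allocates one buffer per buffer type
        if (!ggml_gallocr_reserve_n(galloc, graph, node_buffer_ids)) {
            ggml_gallocr_free(galloc);
            return false;
        }

        // with multiple buffers, alloc_graph does not re-reserve automatically,
        // so the graph must still match the reserved node assignment here
        bool ok = ggml_gallocr_alloc_graph(galloc, graph);
        ggml_gallocr_free(galloc);
        return ok;
    }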
|
| 409 |
+
|
| 410 |
void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
| 411 |
if (galloc == NULL) {
|
| 412 |
return;
|
| 413 |
}
|
| 414 |
|
| 415 |
+
for (int i = 0; i < galloc->n_buffers; i++) {
|
| 416 |
+
if (galloc->buffers != NULL) {
|
| 417 |
+
ggml_backend_buffer_free(galloc->buffers[i]);
|
| 418 |
+
}
|
| 419 |
+
if (galloc->buf_tallocs != NULL) {
|
| 420 |
+
ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
|
| 421 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
}
|
| 423 |
+
|
| 424 |
+
free(galloc->hash_set.keys);
|
| 425 |
+
free(galloc->hash_values);
|
| 426 |
+
free(galloc->bufts);
|
| 427 |
+
free(galloc->buffers);
|
| 428 |
+
free(galloc->buf_tallocs);
|
| 429 |
+
free(galloc->node_allocs);
|
| 430 |
free(galloc);
|
| 431 |
}
|
| 432 |
|
| 433 |
+
typedef struct ggml_gallocr * ggml_gallocr_t;
|
|
|
|
|
|
|
| 434 |
|
| 435 |
+
static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 436 |
size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
|
| 437 |
return &galloc->hash_values[i];
|
| 438 |
}
|
| 439 |
|
| 440 |
+
static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
|
| 441 |
+
return ggml_gallocr_hash_get(galloc, t)->allocated;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 442 |
}
|
| 443 |
|
| 444 |
+
static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
|
| 445 |
+
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
| 446 |
+
hn->buffer_id = buffer_id;
|
| 447 |
+
hn->offset = offset;
|
| 448 |
+
hn->allocated = true;
|
|
|
|
|
|
|
|
|
| 449 |
}
|
| 450 |
|
| 451 |
+
static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
|
| 452 |
+
return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 453 |
}
|
| 454 |
|
| 455 |
+
static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
|
| 456 |
+
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 457 |
|
| 458 |
+
if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
|
| 459 |
+
hn->allocated = true;
|
| 460 |
+
assert(hn->offset == 0);
|
| 461 |
|
| 462 |
+
// try to reuse a parent's buffer (inplace)
|
| 463 |
+
if (ggml_op_can_inplace(node->op)) {
|
| 464 |
+
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
| 465 |
+
struct ggml_tensor * parent = node->src[i];
|
| 466 |
+
if (parent == NULL) {
|
| 467 |
+
break;
|
| 468 |
+
}
|
| 469 |
|
| 470 |
+
// if the node's data is external, then we cannot re-use it
|
| 471 |
+
if (!ggml_gallocr_is_own(galloc, parent)) {
|
| 472 |
+
AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
|
| 473 |
+
continue;
|
| 474 |
+
}
|
| 475 |
|
| 476 |
+
// outputs cannot be reused
|
| 477 |
+
if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
|
| 478 |
+
AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
|
| 479 |
+
continue;
|
| 480 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 481 |
|
| 482 |
+
if (!ggml_are_same_layout(node, parent)) {
|
| 483 |
+
AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
|
| 484 |
+
continue;
|
| 485 |
+
}
|
|
|
|
| 486 |
|
| 487 |
+
struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
|
| 488 |
+
if (p_hn->n_children == 1 && p_hn->n_views == 0) {
|
| 489 |
+
if (ggml_is_view(parent)) {
|
| 490 |
+
struct ggml_tensor * view_src = parent->view_src;
|
| 491 |
+
struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
|
| 492 |
+
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
|
| 493 |
+
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
|
| 494 |
+
assert(view_src_hn->offset == p_hn->offset);
|
| 495 |
+
hn->buffer_id = p_hn->buffer_id;
|
| 496 |
+
hn->offset = p_hn->offset;
|
| 497 |
+
p_hn->allocated = false; // avoid freeing the parent
|
| 498 |
+
view_src_hn->allocated = false;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 499 |
return;
|
| 500 |
}
|
| 501 |
+
} else {
|
| 502 |
+
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
|
| 503 |
+
hn->buffer_id = p_hn->buffer_id;
|
| 504 |
+
hn->offset = p_hn->offset;
|
| 505 |
+
p_hn->allocated = false; // avoid freeing the parent
|
| 506 |
+
return;
|
| 507 |
}
|
| 508 |
}
|
| 509 |
}
|
|
|
|
| 510 |
}
|
| 511 |
+
// allocate tensor from the buffer
|
| 512 |
+
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
|
| 513 |
+
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
|
| 514 |
+
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
|
| 515 |
+
size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
|
| 516 |
+
hn->buffer_id = buffer_id;
|
| 517 |
+
hn->offset = offset;
|
| 518 |
+
return;
|
| 519 |
}
|
| 520 |
}
|
| 521 |
|
| 522 |
+
static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
|
| 523 |
+
// graph outputs are never freed
|
| 524 |
+
if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
|
| 525 |
+
AT_PRINTF("not freeing output %s\n", node->name);
|
| 526 |
+
return;
|
| 527 |
+
}
|
| 528 |
|
| 529 |
+
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
|
| 530 |
+
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
|
| 531 |
+
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
| 532 |
+
size_t offset = hn->offset;
|
| 533 |
+
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
|
| 534 |
+
ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
|
| 535 |
+
hn->allocated = false;
|
| 536 |
}
|
| 537 |
|
| 538 |
+
static int get_node_buffer_id(const int * node_buffer_ids, int i) {
|
| 539 |
+
return node_buffer_ids ? node_buffer_ids[i] : 0;
|
| 540 |
+
}
|
| 541 |
+
|
| 542 |
+
static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
|
| 543 |
+
// clear hash tables
|
| 544 |
+
memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
|
| 545 |
+
memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
|
| 546 |
+
|
| 547 |
+
// allocate all graph inputs first to avoid overwriting them
|
| 548 |
+
for (int i = 0; i < graph->n_nodes; i++) {
|
| 549 |
+
if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
|
| 550 |
+
ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
|
| 551 |
+
}
|
| 552 |
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 553 |
+
if (graph->nodes[i]->src[j] == NULL) {
|
| 554 |
+
break;
|
| 555 |
+
}
|
| 556 |
+
if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
|
| 557 |
+
ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
|
| 558 |
+
}
|
| 559 |
+
}
|
| 560 |
+
}
|
| 561 |
|
| 562 |
// count number of children and views
|
| 563 |
+
for (int i = 0; i < graph->n_nodes; i++) {
|
| 564 |
+
struct ggml_tensor * node = graph->nodes[i];
|
| 565 |
|
| 566 |
if (ggml_is_view(node)) {
|
| 567 |
struct ggml_tensor * view_src = node->view_src;
|
| 568 |
+
ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 569 |
}
|
| 570 |
|
| 571 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
|
|
| 573 |
if (parent == NULL) {
|
| 574 |
break;
|
| 575 |
}
|
| 576 |
+
ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
|
|
|
|
|
|
|
|
|
|
| 577 |
}
|
| 578 |
}
|
| 579 |
|
| 580 |
// allocate tensors
|
| 581 |
+
for (int i = 0; i < graph->n_nodes; i++) {
|
| 582 |
+
struct ggml_tensor * node = graph->nodes[i];
|
| 583 |
+
int buffer_id = get_node_buffer_id(node_buffer_ids, i);
|
| 584 |
+
|
| 585 |
+
// allocate parents (only leafs need to be allocated at this point)
|
| 586 |
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 587 |
+
struct ggml_tensor * parent = node->src[j];
|
| 588 |
+
if (parent == NULL) {
|
| 589 |
+
break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 590 |
}
|
| 591 |
+
ggml_gallocr_allocate_node(galloc, parent, buffer_id);
|
| 592 |
+
}
|
| 593 |
|
| 594 |
+
// allocate node
|
| 595 |
+
ggml_gallocr_allocate_node(galloc, node, buffer_id);
|
| 596 |
|
| 597 |
+
AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
|
| 598 |
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 599 |
+
struct ggml_tensor * parent = node->src[j];
|
| 600 |
+
if (parent == NULL) {
|
| 601 |
+
break;
|
| 602 |
+
}
|
| 603 |
+
AT_PRINTF("%s", parent->name);
|
| 604 |
+
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
|
| 605 |
+
AT_PRINTF(", ");
|
|
|
|
| 606 |
}
|
|
|
|
| 607 |
}
|
| 608 |
+
AT_PRINTF("\n");
|
| 609 |
|
| 610 |
// update parents
|
| 611 |
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 612 |
+
struct ggml_tensor * parent = node->src[j];
|
| 613 |
+
if (parent == NULL) {
|
| 614 |
+
break;
|
| 615 |
+
}
|
| 616 |
+
struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
|
| 617 |
+
p_hn->n_children -= 1;
|
| 618 |
+
|
| 619 |
+
AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
|
| 620 |
+
parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
|
| 621 |
+
|
| 622 |
+
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
|
| 623 |
+
if (ggml_is_view(parent)) {
|
| 624 |
+
struct ggml_tensor * view_src = parent->view_src;
|
| 625 |
+
struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
|
| 626 |
+
view_src_hn->n_views -= 1;
|
| 627 |
+
AT_PRINTF("view_src %s: %d children, %d views\n",
|
| 628 |
+
view_src->name, view_src_hn->n_children, view_src_hn->n_views);
|
| 629 |
+
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
|
| 630 |
+
ggml_gallocr_free_node(galloc, view_src, buffer_id);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 631 |
}
|
| 632 |
}
|
| 633 |
+
else if (p_hn->allocated) {
|
| 634 |
+
ggml_gallocr_free_node(galloc, parent, buffer_id);
|
| 635 |
+
}
|
| 636 |
}
|
| 637 |
AT_PRINTF("\n");
|
|
|
|
|
|
|
|
|
|
| 638 |
}
|
| 639 |
}
|
| 640 |
}
|
| 641 |
|
| 642 |
+
bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
|
| 643 |
size_t hash_size = graph->visited_hash_table.size;
|
| 644 |
|
| 645 |
+
// initialize hash table
|
| 646 |
if (galloc->hash_set.size < hash_size) {
|
| 647 |
+
free(galloc->hash_set.keys);
|
| 648 |
+
free(galloc->hash_values);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 649 |
galloc->hash_set.size = hash_size;
|
| 650 |
+
galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
|
| 651 |
+
galloc->hash_values = calloc(sizeof(struct hash_node), hash_size);
|
| 652 |
+
GGML_ASSERT(galloc->hash_set.keys != NULL);
|
| 653 |
+
GGML_ASSERT(galloc->hash_values != NULL);
|
| 654 |
+
} else {
|
| 655 |
+
// reset hash table
|
| 656 |
+
memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
|
| 657 |
+
memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
|
| 658 |
}
|
| 659 |
|
| 660 |
+
// reset allocators
|
| 661 |
+
for (int i = 0; i < galloc->n_buffers; i++) {
|
| 662 |
+
ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
|
| 663 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 664 |
|
| 665 |
+
// allocate in hash table
|
| 666 |
+
ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids);
|
| 667 |
|
| 668 |
+
// set the node_allocs from the hash table
|
| 669 |
+
if (galloc->n_nodes < graph->n_nodes) {
|
| 670 |
+
free(galloc->node_allocs);
|
| 671 |
+
galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
|
| 672 |
+
GGML_ASSERT(galloc->node_allocs != NULL);
|
| 673 |
}
|
| 674 |
+
galloc->n_nodes = graph->n_nodes;
|
| 675 |
+
for (int i = 0; i < graph->n_nodes; i++) {
|
| 676 |
+
struct ggml_tensor * node = graph->nodes[i];
|
| 677 |
+
struct node_alloc * node_alloc = &galloc->node_allocs[i];
|
| 678 |
+
node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
|
| 679 |
+
if (node->view_src || node->data) {
|
| 680 |
+
node_alloc->dst.offset = SIZE_MAX;
|
| 681 |
+
node_alloc->dst.size_max = 0;
|
| 682 |
+
} else {
|
| 683 |
+
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
| 684 |
+
node_alloc->dst.offset = hn->offset;
|
| 685 |
+
node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
|
| 686 |
+
}
|
| 687 |
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 688 |
+
struct ggml_tensor * src = node->src[j];
|
| 689 |
+
if (!src || src->view_src || src->data) {
|
| 690 |
+
node_alloc->src[j].offset = SIZE_MAX;
|
| 691 |
+
node_alloc->src[j].size_max = 0;
|
| 692 |
+
} else {
|
| 693 |
+
struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
|
| 694 |
+
node_alloc->src[j].offset = hn->offset;
|
| 695 |
+
node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
|
| 696 |
+
}
|
| 697 |
+
}
|
| 698 |
}
|
|
|
|
| 699 |
|
| 700 |
+
// reallocate buffers if needed
|
| 701 |
+
for (int i = 0; i < galloc->n_buffers; i++) {
|
| 702 |
+
size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
|
| 703 |
+
size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
|
| 704 |
|
| 705 |
+
if (new_size > cur_size) {
|
| 706 |
+
#ifndef NDEBUG
|
| 707 |
+
fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
| 708 |
+
#endif
|
| 709 |
+
ggml_backend_buffer_free(galloc->buffers[i]);
|
| 710 |
+
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
| 711 |
+
if (galloc->buffers[i] == NULL) {
|
| 712 |
+
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
| 713 |
+
return false;
|
| 714 |
+
}
|
| 715 |
+
}
|
| 716 |
+
}
|
| 717 |
|
| 718 |
+
return true;
|
|
|
|
|
|
|
| 719 |
}
|
| 720 |
|
| 721 |
+
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
| 722 |
+
return ggml_gallocr_reserve_n(galloc, graph, NULL);
|
|
|
|
|
|
|
|
|
| 723 |
}
|
| 724 |
|
| 725 |
+
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
|
| 726 |
+
assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
|
|
|
|
| 727 |
|
| 728 |
+
if (node->view_src != NULL) {
|
| 729 |
+
if (node->buffer == NULL) {
|
| 730 |
+
assert(tensor_alloc->offset == SIZE_MAX);
|
| 731 |
+
if (node->view_src->buffer == NULL) {
|
| 732 |
+
// this tensor was allocated without ggml-backend
|
| 733 |
+
return;
|
| 734 |
+
}
|
| 735 |
+
ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
|
| 736 |
+
}
|
| 737 |
+
} else {
|
| 738 |
+
if (node->data == NULL) {
|
| 739 |
+
assert(tensor_alloc->offset != SIZE_MAX);
|
| 740 |
+
assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
|
| 741 |
+
void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
|
| 742 |
+
void * addr = (char *)base + tensor_alloc->offset;
|
| 743 |
+
ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
|
| 744 |
+
} else {
|
| 745 |
+
if (node->buffer == NULL) {
|
| 746 |
+
// this tensor was allocated without ggml-backend
|
| 747 |
+
return;
|
| 748 |
+
}
|
| 749 |
|
| 750 |
+
#ifndef NDEBUG
|
| 751 |
+
size_t offset =
|
| 752 |
+
(char *)node->data -
|
| 753 |
+
(char *)ggml_backend_buffer_get_base(node->buffer);
|
| 754 |
+
size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
|
| 755 |
+
assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
|
| 756 |
+
assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
|
| 757 |
+
#endif
|
| 758 |
+
}
|
| 759 |
+
}
|
| 760 |
}
|
| 761 |
|
| 762 |
+
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
|
| 763 |
+
ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
|
| 764 |
+
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
|
| 765 |
+
return talloc->size_max >= node_size;
|
| 766 |
}
|
| 767 |
|
| 768 |
+
static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
|
| 769 |
+
if (galloc->n_nodes != graph->n_nodes) {
|
| 770 |
+
#ifndef NDEBUG
|
| 771 |
+
fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
|
| 772 |
+
#endif
|
| 773 |
+
return true;
|
| 774 |
+
}
|
| 775 |
|
| 776 |
+
for (int i = 0; i < graph->n_nodes; i++) {
|
| 777 |
+
struct ggml_tensor * node = graph->nodes[i];
|
| 778 |
+
struct node_alloc * node_alloc = &galloc->node_allocs[i];
|
| 779 |
|
| 780 |
+
if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
|
| 781 |
+
#ifndef NDEBUG
|
| 782 |
+
fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
|
| 783 |
+
#endif
|
| 784 |
+
return true;
|
| 785 |
+
}
|
| 786 |
|
| 787 |
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 788 |
+
struct ggml_tensor * src = node->src[j];
|
| 789 |
+
if (src == NULL) {
|
| 790 |
+
break;
|
| 791 |
+
}
|
| 792 |
+
if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
|
| 793 |
+
#ifndef NDEBUG
|
| 794 |
+
fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
|
| 795 |
+
#endif
|
| 796 |
+
return true;
|
| 797 |
+
}
|
| 798 |
+
}
|
| 799 |
}
|
| 800 |
|
| 801 |
+
return false;
|
|
|
|
|
|
|
| 802 |
}
|
| 803 |
|
| 804 |
+
bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
|
| 805 |
+
if (ggml_gallocr_needs_realloc(galloc, graph)) {
|
| 806 |
+
if (galloc->n_buffers == 1) {
|
| 807 |
+
#ifndef NDEBUG
|
| 808 |
+
fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
|
| 809 |
+
#endif
|
| 810 |
+
if (!ggml_gallocr_reserve(galloc, graph)) {
|
| 811 |
+
return false;
|
| 812 |
+
}
|
| 813 |
+
} else {
|
| 814 |
+
#ifndef NDEBUG
|
| 815 |
+
fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
|
| 816 |
+
#endif
|
| 817 |
+
return false;
|
| 818 |
+
}
|
| 819 |
+
}
|
| 820 |
|
| 821 |
+
// reset buffers
|
| 822 |
+
for (int i = 0; i < galloc->n_buffers; i++) {
|
| 823 |
+
// zero size buffers are not allocated
|
| 824 |
+
if (galloc->buffers[i] != NULL) {
|
| 825 |
+
ggml_backend_buffer_reset(galloc->buffers[i]);
|
| 826 |
+
}
|
| 827 |
+
}
|
| 828 |
|
| 829 |
+
// allocate the graph tensors from the previous assignments
|
| 830 |
+
for (int i = 0; i < graph->n_nodes; i++) {
|
| 831 |
+
struct ggml_tensor * node = graph->nodes[i];
|
| 832 |
+
struct node_alloc * node_alloc = &galloc->node_allocs[i];
|
| 833 |
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 834 |
+
struct ggml_tensor * src = node->src[j];
|
| 835 |
+
if (src == NULL) {
|
| 836 |
+
break;
|
| 837 |
+
}
|
| 838 |
+
ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
|
| 839 |
+
}
|
| 840 |
+
ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
|
| 841 |
+
}
|
| 842 |
|
| 843 |
+
return true;
|
|
|
|
| 844 |
}
|
| 845 |
|
| 846 |
+
size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
| 847 |
+
GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
|
| 848 |
+
|
| 849 |
+
if (galloc->buffers[buffer_id] == NULL) {
|
| 850 |
+
return 0;
|
| 851 |
+
}
|
| 852 |
+
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
|
| 853 |
}
|
| 854 |
|
| 855 |
// utils
...
         return false;
     }

+    struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer);

     for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
         if (t->data == NULL) {
             if (t->view_src == NULL) {
                 ggml_tallocr_alloc(tallocr, t);
+            } else if (t->buffer == NULL) {
                 ggml_backend_view_init(buffer, t);
             }
         } else {
+            if (t->view_src != NULL && t->buffer == NULL) {
                 // view of a pre-allocated tensor
                 ggml_backend_view_init(buffer, t);
             }
...
     }

     if (this_size > max_size) {
         fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
                 __func__, t->name,
                 ggml_backend_buft_name(buft),
...
     }

     if (n_buffers == 0) {
 #ifndef NDEBUG
         fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
 #endif
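The v3 allocator reports failures through bool return values instead of aborting, and ggml_gallocr_alloc_graph only reallocates on its own when a single buffer is in use. Below is a minimal caller-side sketch of one way to react to that; it is not code from this commit, and allocate_for_eval / node_buffer_ids are hypothetical application-side names (node_buffer_ids stands for whatever per-node assignment the caller previously passed to ggml_gallocr_reserve_n).

// Sketch: handling the bool results of the v3 graph allocator.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <stdbool.h>
#include <stdio.h>

static bool allocate_for_eval(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
    if (ggml_gallocr_alloc_graph(galloc, graph)) {
        return true; // single-buffer graphs may have been reallocated automatically
    }
    // with multiple buffers the allocator refuses to reallocate on its own:
    // reserve again with the per-node buffer assignments, then retry once
    if (!ggml_gallocr_reserve_n(galloc, graph, node_buffer_ids)) {
        fprintf(stderr, "buffer reservation failed\n");
        return false;
    }
    return ggml_gallocr_alloc_graph(galloc, graph);
}

The retry mirrors the comment in the header below: with multiple buffers, a topology change requires an explicit reserve before the graph can be allocated again.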
ggml-alloc.h
CHANGED
@@ -6,88 +6,62 @@
 extern "C" {
 #endif

-struct
-struct ggml_backend_buffer;
-struct

-//
-
-//
-
-typedef struct ggml_allocr * ggml_allocr_t;
-
-// initialize allocator for use with CPU backend only
-GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment);
-GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment);
-
-// initialize allocator for use with ggml-backend
-GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
-GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
-GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend);
-
-GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc);
-
-// tell the allocator to parse nodes following the order described in the list
-// you should call this if your graph are optimized to execute out-of-order
-GGML_API void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n);
-
-GGML_API void   ggml_allocr_free       (ggml_allocr_t alloc);
-GGML_API bool   ggml_allocr_is_measure (ggml_allocr_t alloc);
-GGML_API void   ggml_allocr_reset      (ggml_allocr_t alloc);
-GGML_API void   ggml_allocr_alloc      (ggml_allocr_t alloc, struct ggml_tensor * tensor);
-GGML_API size_t ggml_allocr_max_size   (ggml_allocr_t alloc);
-
-GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph);

-//
-
-//
-
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
-
-GGML_API void   ggml_tallocr_alloc   (ggml_tallocr_t talloc, struct ggml_tensor * tensor);
-GGML_API size_t ggml_tallocr_max_size(ggml_tallocr_t talloc);

-// Graph allocator
 typedef struct ggml_gallocr * ggml_gallocr_t;

-GGML_API ggml_gallocr_t ggml_gallocr_new(
-GGML_API
-
-//
-
-        struct ggml_cgraph * graph,
-        struct ggml_hash_set hash_set,
-        ggml_tallocr_t * hash_node_talloc);

 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx,
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx,

 #ifdef __cplusplus
 }

 extern "C" {
 #endif

+typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct ggml_backend * ggml_backend_t;

+// Tensor allocator
+typedef struct ggml_tallocr * ggml_tallocr_t;

+GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer);
+GGML_API void           ggml_tallocr_free(ggml_tallocr_t talloc);
+GGML_API void           ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);

+// Graph allocator
+/*
+  Example usage:
+    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
+
+    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
+    ggml_gallocr_reserve(galloc, build_graph(max_batch));
+
+    // allocate the graph
+    struct ggml_cgraph * graph = build_graph(batch);
+    ggml_gallocr_alloc_graph(galloc, graph);
+
+    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
+
+    // evaluate the graph
+    ggml_backend_graph_compute(backend, graph);
+*/

+// special tensor flags for use with the graph allocator:
+//   ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
+//   ggml_set_output(): output tensors are never freed and never overwritten

 typedef struct ggml_gallocr * ggml_gallocr_t;

+GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
+GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
+GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);

+// pre-allocate buffers from a measure graph - does not allocate or modify the graph
+// call with a worst-case graph to avoid buffer reallocations
+// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
+// returns false if the buffer allocation failed
+GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids);

+// automatic reallocation if the topology changes when using a single buffer
+// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
+GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);

+GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);

 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);

 #ifdef __cplusplus
 }
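Buffer allocation can now return NULL instead of asserting, so callers of the utils above are expected to check the result (the whisper.cpp part of this commit adds such checks). A small illustrative sketch, not taken from whisper.cpp itself; alloc_model_tensors is a hypothetical wrapper name:

// Sketch: allocating all tensors of a ggml_context on a backend and checking for failure.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <stdio.h>

static ggml_backend_buffer_t alloc_model_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
    if (buf == NULL) {
        // allocation failures are reported to the caller rather than aborting
        fprintf(stderr, "failed to allocate backend buffer\n");
        return NULL;
    }
    fprintf(stderr, "model buffer size: %zu bytes\n", ggml_backend_buffer_get_size(buf));
    return buf;
}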
ggml-backend.c
CHANGED
|
@@ -475,6 +475,8 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
|
|
| 475 |
|
| 476 |
// backend CPU
|
| 477 |
|
|
|
|
|
|
|
| 478 |
GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
|
| 479 |
return "CPU";
|
| 480 |
|
|
@@ -482,7 +484,14 @@ GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t
|
|
| 482 |
}
|
| 483 |
|
| 484 |
GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
| 485 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 486 |
}
|
| 487 |
|
| 488 |
GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
@@ -540,8 +549,6 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
|
|
| 540 |
/* .reset = */ NULL,
|
| 541 |
};
|
| 542 |
|
| 543 |
-
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
|
| 544 |
-
|
| 545 |
GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
| 546 |
return "CPU";
|
| 547 |
|
|
@@ -550,9 +557,11 @@ GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend
|
|
| 550 |
|
| 551 |
GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 552 |
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
| 553 |
-
void * data = malloc(size); // TODO:
|
| 554 |
-
|
| 555 |
-
|
|
|
|
|
|
|
| 556 |
|
| 557 |
return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
|
| 558 |
}
|
|
@@ -766,6 +775,9 @@ static struct ggml_backend_i cpu_backend_i = {
|
|
| 766 |
|
| 767 |
ggml_backend_t ggml_backend_cpu_init(void) {
|
| 768 |
struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
|
|
|
|
|
|
|
|
|
|
| 769 |
|
| 770 |
ctx->n_threads = GGML_DEFAULT_N_THREADS;
|
| 771 |
ctx->work_data = NULL;
|
|
@@ -774,6 +786,10 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
|
| 774 |
ctx->abort_callback_data = NULL;
|
| 775 |
|
| 776 |
ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
|
|
|
|
|
|
|
|
|
|
|
|
|
| 777 |
|
| 778 |
*cpu_backend = (struct ggml_backend) {
|
| 779 |
/* .interface = */ cpu_backend_i,
|
|
@@ -865,6 +881,8 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_back
|
|
| 865 |
ctx->n_buffers = n_buffers;
|
| 866 |
ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
|
| 867 |
|
|
|
|
|
|
|
| 868 |
size_t total_size = 0;
|
| 869 |
for (size_t i = 0; i < n_buffers; i++) {
|
| 870 |
ctx->buffers[i] = buffers[i];
|
|
@@ -886,6 +904,18 @@ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer,
|
|
| 886 |
}
|
| 887 |
}
|
| 888 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 889 |
|
| 890 |
// scheduler
|
| 891 |
|
|
@@ -894,7 +924,7 @@ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer,
|
|
| 894 |
#define GGML_MAX_SPLIT_INPUTS 16
|
| 895 |
|
| 896 |
struct ggml_backend_sched_split {
|
| 897 |
-
|
| 898 |
int i_start;
|
| 899 |
int i_end;
|
| 900 |
struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
|
|
@@ -909,15 +939,17 @@ struct ggml_backend_sched {
|
|
| 909 |
int n_backends;
|
| 910 |
ggml_backend_t backends[GGML_MAX_BACKENDS];
|
| 911 |
ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
|
| 912 |
-
ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
|
| 913 |
|
| 914 |
ggml_gallocr_t galloc;
|
| 915 |
|
| 916 |
// hash keys of the nodes in the graph
|
| 917 |
struct ggml_hash_set hash_set;
|
| 918 |
-
// hash values
|
| 919 |
-
|
| 920 |
-
struct ggml_tensor * (*
|
|
|
|
|
|
|
|
|
|
| 921 |
|
| 922 |
// copy of the graph with modified inputs
|
| 923 |
struct ggml_cgraph * graph;
|
|
@@ -927,77 +959,46 @@ struct ggml_backend_sched {
|
|
| 927 |
|
| 928 |
struct ggml_context * ctx;
|
| 929 |
|
|
|
|
|
|
|
|
|
|
| 930 |
// align context_buffer to GGML_MEM_ALIGN
|
| 931 |
#ifdef _MSC_VER
|
| 932 |
__declspec(align(GGML_MEM_ALIGN))
|
| 933 |
#else
|
| 934 |
__attribute__((aligned(GGML_MEM_ALIGN)))
|
| 935 |
#endif
|
| 936 |
-
char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
|
| 937 |
-
|
| 938 |
-
ggml_backend_sched_eval_callback callback_eval;
|
| 939 |
-
void * callback_eval_user_data;
|
| 940 |
};
|
| 941 |
|
| 942 |
#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
|
| 943 |
-
#define
|
| 944 |
-
|
| 945 |
-
static bool ggml_is_view_op(enum ggml_op op) {
|
| 946 |
-
return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
|
| 947 |
-
}
|
| 948 |
|
| 949 |
-
// returns the priority of the backend, lower is
|
| 950 |
-
static int
|
| 951 |
for (int i = 0; i < sched->n_backends; i++) {
|
| 952 |
if (sched->backends[i] == backend) {
|
| 953 |
return i;
|
| 954 |
}
|
| 955 |
}
|
| 956 |
-
return
|
| 957 |
}
|
| 958 |
|
| 959 |
-
static int
|
| 960 |
-
for (int i = 0; i < sched->n_backends; i++) {
|
| 961 |
-
if (sched->tallocs[i] == allocr) {
|
| 962 |
-
return i;
|
| 963 |
-
}
|
| 964 |
-
}
|
| 965 |
-
return INT_MAX;
|
| 966 |
-
}
|
| 967 |
-
|
| 968 |
-
static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
|
| 969 |
if (buffer == NULL) {
|
| 970 |
-
return
|
| 971 |
-
}
|
| 972 |
-
|
| 973 |
-
// check if this is already allocate in a allocr buffer (from user manual allocations)
|
| 974 |
-
for (int i = 0; i < sched->n_backends; i++) {
|
| 975 |
-
if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) {
|
| 976 |
-
return sched->tallocs[i];
|
| 977 |
-
}
|
| 978 |
}
|
| 979 |
|
| 980 |
// find highest prio backend that supports the buffer type
|
| 981 |
for (int i = 0; i < sched->n_backends; i++) {
|
| 982 |
if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
|
| 983 |
-
return
|
| 984 |
}
|
| 985 |
}
|
| 986 |
GGML_ASSERT(false && "tensor buffer type not supported by any backend");
|
| 987 |
}
|
| 988 |
|
| 989 |
-
static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
|
| 990 |
-
if (allocr == NULL) {
|
| 991 |
-
return NULL;
|
| 992 |
-
}
|
| 993 |
-
for (int i = 0; i < sched->n_backends; i++) {
|
| 994 |
-
if (sched->tallocs[i] == allocr) {
|
| 995 |
-
return sched->backends[i];
|
| 996 |
-
}
|
| 997 |
-
}
|
| 998 |
-
GGML_UNREACHABLE();
|
| 999 |
-
}
|
| 1000 |
-
|
| 1001 |
#if 0
|
| 1002 |
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
|
| 1003 |
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
|
|
@@ -1008,37 +1009,39 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_I
|
|
| 1008 |
#endif
|
| 1009 |
|
| 1010 |
// returns the backend that should be used for the node based on the current locations
|
| 1011 |
-
static
|
|
|
|
|
|
|
| 1012 |
// assign pre-allocated nodes to their backend
|
| 1013 |
// dst
|
| 1014 |
-
|
| 1015 |
-
if (
|
| 1016 |
SET_CAUSE(node, "1.dst");
|
| 1017 |
-
return
|
| 1018 |
}
|
| 1019 |
// view_src
|
| 1020 |
-
if (
|
| 1021 |
-
|
| 1022 |
-
if (
|
| 1023 |
SET_CAUSE(node, "1.vsrc");
|
| 1024 |
-
return
|
| 1025 |
}
|
| 1026 |
}
|
| 1027 |
// assign nodes that use weights to the backend of the weights
|
| 1028 |
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
| 1029 |
-
const struct ggml_tensor * src =
|
| 1030 |
if (src == NULL) {
|
| 1031 |
break;
|
| 1032 |
}
|
| 1033 |
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
| 1034 |
-
|
| 1035 |
// operations with weights are always run on the same backend as the weights
|
| 1036 |
SET_CAUSE(node, "1.wgt%d", i);
|
| 1037 |
-
return
|
| 1038 |
}
|
| 1039 |
}
|
| 1040 |
|
| 1041 |
-
return
|
| 1042 |
}
|
| 1043 |
|
| 1044 |
static char * fmt_size(size_t size) {
|
|
@@ -1051,11 +1054,11 @@ static char * fmt_size(size_t size) {
|
|
| 1051 |
return buffer;
|
| 1052 |
}
|
| 1053 |
|
| 1054 |
-
static void
|
| 1055 |
int cur_split = 0;
|
| 1056 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1057 |
if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
|
| 1058 |
-
ggml_backend_t split_backend =
|
| 1059 |
fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
|
| 1060 |
sched->splits[cur_split].n_inputs);
|
| 1061 |
for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
|
|
@@ -1069,17 +1072,15 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
|
|
| 1069 |
if (ggml_is_view_op(node->op)) {
|
| 1070 |
continue;
|
| 1071 |
}
|
| 1072 |
-
|
| 1073 |
-
ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
|
| 1074 |
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
|
| 1075 |
-
fmt_size(ggml_nbytes(node)),
|
| 1076 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1077 |
struct ggml_tensor * src = node->src[j];
|
| 1078 |
if (src == NULL) {
|
| 1079 |
break;
|
| 1080 |
}
|
| 1081 |
-
|
| 1082 |
-
ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
|
| 1083 |
fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
|
| 1084 |
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
|
| 1085 |
}
|
|
@@ -1087,23 +1088,13 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
|
|
| 1087 |
}
|
| 1088 |
}
|
| 1089 |
|
| 1090 |
-
// creates a copy of the tensor with the same memory layout
|
| 1091 |
-
static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
|
| 1092 |
-
struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
|
| 1093 |
-
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
| 1094 |
-
dup->nb[i] = tensor->nb[i];
|
| 1095 |
-
}
|
| 1096 |
-
return dup;
|
| 1097 |
-
}
|
| 1098 |
-
|
| 1099 |
-
|
| 1100 |
//#define DEBUG_PASS1
|
| 1101 |
//#define DEBUG_PASS2
|
| 1102 |
//#define DEBUG_PASS3
|
| 1103 |
//#define DEBUG_PASS4
|
| 1104 |
|
| 1105 |
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
| 1106 |
-
static void
|
| 1107 |
// reset splits
|
| 1108 |
sched->n_splits = 0;
|
| 1109 |
sched->is_reset = false;
|
|
@@ -1125,28 +1116,28 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
| 1125 |
// pass 1: assign backends to ops with pre-allocated inputs
|
| 1126 |
for (int i = 0; i < graph->n_leafs; i++) {
|
| 1127 |
struct ggml_tensor * leaf = graph->leafs[i];
|
| 1128 |
-
if (
|
| 1129 |
// do not overwrite user assignments
|
| 1130 |
continue;
|
| 1131 |
}
|
| 1132 |
-
|
| 1133 |
}
|
| 1134 |
|
| 1135 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1136 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1137 |
-
if (
|
| 1138 |
// do not overwrite user assignments
|
| 1139 |
continue;
|
| 1140 |
}
|
| 1141 |
-
|
| 1142 |
// src
|
| 1143 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1144 |
struct ggml_tensor * src = node->src[j];
|
| 1145 |
if (src == NULL) {
|
| 1146 |
break;
|
| 1147 |
}
|
| 1148 |
-
if (
|
| 1149 |
-
|
| 1150 |
}
|
| 1151 |
}
|
| 1152 |
}
|
|
@@ -1161,22 +1152,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
| 1161 |
|
| 1162 |
// pass 2.1 expand gpu up
|
| 1163 |
{
|
| 1164 |
-
|
| 1165 |
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
| 1166 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1167 |
if (ggml_is_view_op(node->op)) {
|
| 1168 |
continue;
|
| 1169 |
}
|
| 1170 |
-
|
| 1171 |
-
if (
|
| 1172 |
-
if (
|
| 1173 |
// skip cpu (lowest prio backend)
|
| 1174 |
-
|
| 1175 |
} else {
|
| 1176 |
-
|
| 1177 |
}
|
| 1178 |
} else {
|
| 1179 |
-
|
| 1180 |
SET_CAUSE(node, "2.1");
|
| 1181 |
}
|
| 1182 |
}
|
|
@@ -1184,22 +1175,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
| 1184 |
|
| 1185 |
// pass 2.2 expand gpu down
|
| 1186 |
{
|
| 1187 |
-
|
| 1188 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1189 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1190 |
if (ggml_is_view_op(node->op)) {
|
| 1191 |
continue;
|
| 1192 |
}
|
| 1193 |
-
|
| 1194 |
-
if (
|
| 1195 |
-
if (
|
| 1196 |
// skip cpu (lowest prio backend)
|
| 1197 |
-
|
| 1198 |
} else {
|
| 1199 |
-
|
| 1200 |
}
|
| 1201 |
} else {
|
| 1202 |
-
|
| 1203 |
SET_CAUSE(node, "2.2");
|
| 1204 |
}
|
| 1205 |
}
|
|
@@ -1207,17 +1198,17 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
| 1207 |
|
| 1208 |
// pass 2.3 expand rest up
|
| 1209 |
{
|
| 1210 |
-
|
| 1211 |
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
| 1212 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1213 |
if (ggml_is_view_op(node->op)) {
|
| 1214 |
continue;
|
| 1215 |
}
|
| 1216 |
-
|
| 1217 |
-
if (
|
| 1218 |
-
|
| 1219 |
} else {
|
| 1220 |
-
|
| 1221 |
SET_CAUSE(node, "2.3");
|
| 1222 |
}
|
| 1223 |
}
|
|
@@ -1225,17 +1216,17 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
| 1225 |
|
| 1226 |
// pass 2.4 expand rest down
|
| 1227 |
{
|
| 1228 |
-
|
| 1229 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1230 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1231 |
if (ggml_is_view_op(node->op)) {
|
| 1232 |
continue;
|
| 1233 |
}
|
| 1234 |
-
|
| 1235 |
-
if (
|
| 1236 |
-
|
| 1237 |
} else {
|
| 1238 |
-
|
| 1239 |
SET_CAUSE(node, "2.4");
|
| 1240 |
}
|
| 1241 |
}
|
|
@@ -1247,9 +1238,9 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
| 1247 |
// pass 3: assign backends to remaining src from dst and view_src
|
| 1248 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1249 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1250 |
-
|
| 1251 |
-
if (node->view_src != NULL &&
|
| 1252 |
-
|
| 1253 |
SET_CAUSE(node, "3.vsrc");
|
| 1254 |
}
|
| 1255 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
@@ -1257,14 +1248,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
| 1257 |
if (src == NULL) {
|
| 1258 |
break;
|
| 1259 |
}
|
| 1260 |
-
|
| 1261 |
-
if (
|
| 1262 |
if (src->view_src != NULL) {
|
| 1263 |
// views are always on the same backend as the source
|
| 1264 |
-
|
| 1265 |
SET_CAUSE(src, "3.vsrc");
|
| 1266 |
} else {
|
| 1267 |
-
|
| 1268 |
SET_CAUSE(src, "3.cur");
|
| 1269 |
}
|
| 1270 |
}
|
|
@@ -1281,15 +1272,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
| 1281 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1282 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1283 |
if (!ggml_is_view_op(node->op)) {
|
| 1284 |
-
sched->splits[0].
|
| 1285 |
break;
|
| 1286 |
}
|
| 1287 |
}
|
| 1288 |
sched->splits[0].i_start = 0;
|
| 1289 |
sched->splits[0].n_inputs = 0;
|
| 1290 |
memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
|
| 1291 |
-
|
| 1292 |
-
size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
|
| 1293 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1294 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1295 |
|
|
@@ -1297,19 +1287,18 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
| 1297 |
continue;
|
| 1298 |
}
|
| 1299 |
|
| 1300 |
-
|
| 1301 |
|
| 1302 |
-
GGML_ASSERT(
|
| 1303 |
|
| 1304 |
-
if (
|
| 1305 |
sched->splits[cur_split].i_end = i;
|
| 1306 |
cur_split++;
|
| 1307 |
GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
|
| 1308 |
-
sched->splits[cur_split].
|
| 1309 |
sched->splits[cur_split].i_start = i;
|
| 1310 |
sched->splits[cur_split].n_inputs = 0;
|
| 1311 |
-
|
| 1312 |
-
cur_backend_id = sched_allocr_prio(sched, cur_allocr);
|
| 1313 |
}
|
| 1314 |
|
| 1315 |
// find inputs that are not on the same backend
|
|
@@ -1318,43 +1307,25 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
| 1318 |
if (src == NULL) {
|
| 1319 |
break;
|
| 1320 |
}
|
| 1321 |
-
|
| 1322 |
-
|
| 1323 |
-
if (
|
| 1324 |
// create a copy of the input in the split's backend
|
| 1325 |
size_t id = hash_id(src);
|
| 1326 |
-
if (sched->
|
| 1327 |
-
ggml_backend_t backend =
|
| 1328 |
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
| 1329 |
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
|
| 1330 |
|
| 1331 |
-
sched->
|
| 1332 |
-
|
| 1333 |
SET_CAUSE(tensor_copy, "4.cpy");
|
| 1334 |
|
| 1335 |
int n_inputs = sched->splits[cur_split].n_inputs++;
|
| 1336 |
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
|
| 1337 |
sched->splits[cur_split].inputs[n_inputs] = src;
|
| 1338 |
}
|
| 1339 |
-
node->src[j] = sched->
|
| 1340 |
-
|
| 1341 |
-
#if 0
|
| 1342 |
-
// check if the input is already in the split
|
| 1343 |
-
bool found = false;
|
| 1344 |
-
for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
|
| 1345 |
-
if (sched->splits[cur_split].inputs[k] == src) {
|
| 1346 |
-
found = true;
|
| 1347 |
-
break;
|
| 1348 |
-
}
|
| 1349 |
-
}
|
| 1350 |
-
|
| 1351 |
-
if (!found) {
|
| 1352 |
-
int n_inputs = sched->splits[cur_split].n_inputs++;
|
| 1353 |
-
//printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
|
| 1354 |
-
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
|
| 1355 |
-
sched->splits[cur_split].inputs[n_inputs] = src;
|
| 1356 |
-
}
|
| 1357 |
-
#endif
|
| 1358 |
}
|
| 1359 |
}
|
| 1360 |
}
|
|
@@ -1369,30 +1340,30 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
| 1369 |
// sanity check: all sources should have the same backend as the node
|
| 1370 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1371 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1372 |
-
|
| 1373 |
-
if (
|
| 1374 |
fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
|
| 1375 |
}
|
| 1376 |
-
if (node->view_src != NULL &&
|
| 1377 |
fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
|
| 1378 |
-
node->name,
|
| 1379 |
-
node->view_src->name,
|
| 1380 |
}
|
| 1381 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1382 |
struct ggml_tensor * src = node->src[j];
|
| 1383 |
if (src == NULL) {
|
| 1384 |
break;
|
| 1385 |
}
|
| 1386 |
-
|
| 1387 |
-
if (
|
| 1388 |
fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
|
| 1389 |
-
node->name,
|
| 1390 |
-
j, src->name,
|
| 1391 |
}
|
| 1392 |
-
if (src->view_src != NULL &&
|
| 1393 |
fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
|
| 1394 |
-
src->name,
|
| 1395 |
-
src->view_src->name,
|
| 1396 |
}
|
| 1397 |
}
|
| 1398 |
}
|
|
@@ -1406,32 +1377,43 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
| 1406 |
struct ggml_backend_sched_split * split = &sched->splits[i];
|
| 1407 |
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
|
| 1408 |
|
| 1409 |
-
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
|
| 1410 |
for (int j = 0; j < split->n_inputs; j++) {
|
| 1411 |
struct ggml_tensor * input = split->inputs[j];
|
| 1412 |
-
struct ggml_tensor * input_cpy = sched->
|
|
|
|
| 1413 |
// add a dependency to the input source so that it is not freed before the copy is done
|
| 1414 |
-
|
| 1415 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1416 |
graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
|
| 1417 |
}
|
| 1418 |
|
| 1419 |
for (int j = split->i_start; j < split->i_end; j++) {
|
|
|
|
| 1420 |
graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
|
| 1421 |
}
|
| 1422 |
}
|
| 1423 |
sched->graph = graph_copy;
|
| 1424 |
}
|
| 1425 |
|
| 1426 |
-
static
|
| 1427 |
-
|
| 1428 |
-
|
| 1429 |
-
|
| 1430 |
-
|
| 1431 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1432 |
}
|
| 1433 |
|
| 1434 |
-
static
|
| 1435 |
uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
|
| 1436 |
uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
|
| 1437 |
|
|
@@ -1439,20 +1421,18 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
|
|
| 1439 |
|
| 1440 |
for (int i = 0; i < sched->n_splits; i++) {
|
| 1441 |
struct ggml_backend_sched_split * split = &splits[i];
|
| 1442 |
-
|
| 1443 |
-
|
| 1444 |
|
| 1445 |
// copy the input tensors to the split backend
|
| 1446 |
uint64_t copy_start_us = ggml_time_us();
|
| 1447 |
for (int j = 0; j < split->n_inputs; j++) {
|
| 1448 |
struct ggml_tensor * input = split->inputs[j];
|
| 1449 |
-
struct ggml_tensor * input_cpy = sched->
|
| 1450 |
|
| 1451 |
GGML_ASSERT(input->buffer != NULL);
|
| 1452 |
GGML_ASSERT(input_cpy->buffer != NULL);
|
| 1453 |
|
| 1454 |
-
// TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
|
| 1455 |
-
// this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
|
| 1456 |
ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
|
| 1457 |
}
|
| 1458 |
//ggml_backend_synchronize(split_backend); // necessary to measure copy time
|
|
@@ -1468,7 +1448,9 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
|
|
| 1468 |
|
| 1469 |
uint64_t compute_start_us = ggml_time_us();
|
| 1470 |
if (!sched->callback_eval) {
|
| 1471 |
-
ggml_backend_graph_compute(split_backend, &split->graph)
|
|
|
|
|
|
|
| 1472 |
//ggml_backend_synchronize(split_backend); // necessary to measure compute time
|
| 1473 |
} else {
|
| 1474 |
// similar to ggml_backend_compare_graph_backend
|
|
@@ -1488,7 +1470,9 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
|
|
| 1488 |
|
| 1489 |
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
|
| 1490 |
|
| 1491 |
-
ggml_backend_graph_compute(split_backend, &gv)
|
|
|
|
|
|
|
| 1492 |
|
| 1493 |
if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
|
| 1494 |
break;
|
|
@@ -1510,19 +1494,8 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
|
|
| 1510 |
}
|
| 1511 |
}
|
| 1512 |
#endif
|
| 1513 |
-
}
|
| 1514 |
-
|
| 1515 |
-
static void sched_reset(ggml_backend_sched_t sched) {
|
| 1516 |
-
for (int i = 0; i < sched->n_backends; i++) {
|
| 1517 |
-
ggml_tallocr_reset(sched->tallocs[i]);
|
| 1518 |
-
}
|
| 1519 |
-
// reset state for the next run
|
| 1520 |
-
size_t hash_size = sched->hash_set.size;
|
| 1521 |
-
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
|
| 1522 |
-
memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
|
| 1523 |
-
memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
|
| 1524 |
|
| 1525 |
-
|
| 1526 |
}
|
| 1527 |
|
| 1528 |
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
|
|
@@ -1532,9 +1505,10 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back
|
|
| 1532 |
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
|
| 1533 |
|
| 1534 |
// initialize hash table
|
| 1535 |
-
sched->hash_set
|
| 1536 |
-
sched->
|
| 1537 |
-
sched->
|
|
|
|
| 1538 |
|
| 1539 |
sched->n_backends = n_backends;
|
| 1540 |
for (int i = 0; i < n_backends; i++) {
|
|
@@ -1542,14 +1516,9 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back
|
|
| 1542 |
sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
|
| 1543 |
}
|
| 1544 |
|
| 1545 |
-
sched->galloc =
|
| 1546 |
|
| 1547 |
-
|
| 1548 |
-
for (int i = 0; i < n_backends; i++) {
|
| 1549 |
-
sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
|
| 1550 |
-
}
|
| 1551 |
-
|
| 1552 |
-
sched_reset(sched);
|
| 1553 |
|
| 1554 |
return sched;
|
| 1555 |
}
|
|
@@ -1558,49 +1527,54 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
|
| 1558 |
if (sched == NULL) {
|
| 1559 |
return;
|
| 1560 |
}
|
| 1561 |
-
for (int i = 0; i < sched->n_backends; i++) {
|
| 1562 |
-
ggml_tallocr_free(sched->tallocs[i]);
|
| 1563 |
-
}
|
| 1564 |
ggml_gallocr_free(sched->galloc);
|
| 1565 |
ggml_free(sched->ctx);
|
| 1566 |
free(sched->hash_set.keys);
|
| 1567 |
-
free(sched->
|
| 1568 |
-
free(sched->
|
|
|
|
| 1569 |
free(sched);
|
| 1570 |
}
|
| 1571 |
|
| 1572 |
-
void
|
| 1573 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1574 |
|
| 1575 |
-
|
| 1576 |
-
|
| 1577 |
|
| 1578 |
-
|
| 1579 |
-
|
| 1580 |
-
|
| 1581 |
-
|
| 1582 |
-
|
| 1583 |
}
|
| 1584 |
|
| 1585 |
-
|
|
|
|
| 1586 |
}
|
| 1587 |
|
| 1588 |
-
|
| 1589 |
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
|
| 1590 |
|
| 1591 |
if (!sched->is_reset) {
|
| 1592 |
-
|
| 1593 |
}
|
| 1594 |
|
| 1595 |
-
|
| 1596 |
-
|
| 1597 |
-
|
| 1598 |
-
}
|
| 1599 |
|
| 1600 |
-
|
| 1601 |
-
|
| 1602 |
-
}
|
| 1603 |
|
|
|
|
|
|
|
| 1604 |
|
| 1605 |
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
|
| 1606 |
sched->callback_eval = callback;
|
|
@@ -1611,37 +1585,30 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
|
|
| 1611 |
return sched->n_splits;
|
| 1612 |
}
|
| 1613 |
|
| 1614 |
-
|
| 1615 |
-
int backend_index =
|
| 1616 |
-
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
| 1617 |
-
return sched->tallocs[backend_index];
|
| 1618 |
-
}
|
| 1619 |
-
|
| 1620 |
-
ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
| 1621 |
-
int backend_index = sched_backend_prio(sched, backend);
|
| 1622 |
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
| 1623 |
-
return
|
| 1624 |
}
|
| 1625 |
|
| 1626 |
void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
| 1627 |
-
int backend_index =
|
| 1628 |
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
| 1629 |
-
|
| 1630 |
}
|
| 1631 |
|
| 1632 |
ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
| 1633 |
-
|
| 1634 |
-
if (
|
| 1635 |
return NULL;
|
| 1636 |
}
|
| 1637 |
-
return
|
| 1638 |
}
|
| 1639 |
|
| 1640 |
// utils
|
| 1641 |
|
| 1642 |
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
| 1643 |
GGML_ASSERT(tensor->buffer == NULL);
|
| 1644 |
-
//GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
|
| 1645 |
GGML_ASSERT(tensor->view_src != NULL);
|
| 1646 |
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
| 1647 |
GGML_ASSERT(tensor->view_src->data != NULL);
|
|
@@ -1665,7 +1632,7 @@ void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor
|
|
| 1665 |
ggml_backend_buffer_init_tensor(buffer, tensor);
|
| 1666 |
}
|
| 1667 |
|
| 1668 |
-
static struct ggml_tensor *
|
| 1669 |
struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
|
| 1670 |
|
| 1671 |
GGML_ASSERT(src != NULL);
|
|
@@ -1678,7 +1645,7 @@ static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, stru
|
|
| 1678 |
|
| 1679 |
struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
|
| 1680 |
if (src->view_src != NULL) {
|
| 1681 |
-
dst->view_src =
|
| 1682 |
dst->view_offs = src->view_offs;
|
| 1683 |
}
|
| 1684 |
dst->op = src->op;
|
|
@@ -1691,14 +1658,14 @@ static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, stru
|
|
| 1691 |
if (s == NULL) {
|
| 1692 |
break;
|
| 1693 |
}
|
| 1694 |
-
dst->src[i] =
|
| 1695 |
}
|
| 1696 |
|
| 1697 |
node_copies[id] = dst;
|
| 1698 |
return dst;
|
| 1699 |
}
|
| 1700 |
|
| 1701 |
-
static void
|
| 1702 |
size_t id = ggml_hash_find(hash_set, src);
|
| 1703 |
if (node_init[id]) {
|
| 1704 |
return;
|
|
@@ -1707,7 +1674,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
|
|
| 1707 |
|
| 1708 |
struct ggml_tensor * dst = node_copies[id];
|
| 1709 |
if (dst->view_src != NULL) {
|
| 1710 |
-
|
| 1711 |
ggml_backend_view_init(dst->view_src->buffer, dst);
|
| 1712 |
}
|
| 1713 |
else {
|
|
@@ -1720,17 +1687,17 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
|
|
| 1720 |
if (s == NULL) {
|
| 1721 |
break;
|
| 1722 |
}
|
| 1723 |
-
|
| 1724 |
}
|
| 1725 |
}
|
| 1726 |
|
| 1727 |
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
|
| 1728 |
struct ggml_hash_set hash_set = {
|
| 1729 |
/* .size = */ graph->visited_hash_table.size,
|
| 1730 |
-
/* .keys = */ calloc(sizeof(hash_set.keys[0])
|
| 1731 |
};
|
| 1732 |
-
struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0])
|
| 1733 |
-
bool * node_init = calloc(sizeof(node_init[0])
|
| 1734 |
|
| 1735 |
struct ggml_init_params params = {
|
| 1736 |
/* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
|
|
@@ -1759,7 +1726,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
|
|
| 1759 |
// dup nodes
|
| 1760 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1761 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1762 |
-
|
| 1763 |
}
|
| 1764 |
|
| 1765 |
// allocate nodes
|
|
@@ -1784,7 +1751,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
|
|
| 1784 |
// copy data and init views
|
| 1785 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1786 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1787 |
-
|
| 1788 |
}
|
| 1789 |
|

 // backend CPU

+static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
+
 GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
     return "CPU";
...
 }

 GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+    uintptr_t data = (uintptr_t)buffer->context;
+
+    // align the buffer
+    if (data % TENSOR_ALIGNMENT != 0) {
+        data = GGML_PAD(data, TENSOR_ALIGNMENT);
+    }
+
+    return (void *)data;
 }

 GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
...
     /* .reset = */ NULL,
 };

 GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "CPU";
...

 GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
+    void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
+    if (data == NULL) {
+        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
+        return NULL;
+    }

     return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
 }
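The pair above over-allocates by TENSOR_ALIGNMENT and then rounds the malloc result up to the next 64-byte boundary in get_base. A standalone sketch of that rounding, using a local ALIGN_UP macro that is assumed to behave like GGML_PAD for power-of-two alignments:

// Sketch: round an address up to a multiple of n (n must be a power of two).
#include <stdint.h>
#include <stdio.h>

#define ALIGN_UP(x, n) (((x) + (n) - 1) & ~(uintptr_t)((n) - 1))

int main(void) {
    uintptr_t addr = 0x1003; // example of a malloc result that is not 64-byte aligned
    uintptr_t aligned = ALIGN_UP(addr, 64);
    printf("0x%zx -> 0x%zx\n", (size_t)addr, (size_t)aligned); // prints 0x1003 -> 0x1040
    return 0;
}

Because the buffer was allocated with TENSOR_ALIGNMENT extra bytes, rounding the base pointer up cannot run past the end of the allocation.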
...

 ggml_backend_t ggml_backend_cpu_init(void) {
     struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
+    if (ctx == NULL) {
+        return NULL;
+    }

     ctx->n_threads = GGML_DEFAULT_N_THREADS;
     ctx->work_data = NULL;
...
     ctx->abort_callback_data = NULL;

     ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
+    if (cpu_backend == NULL) {
+        free(ctx);
+        return NULL;
+    }

     *cpu_backend = (struct ggml_backend) {
         /* .interface = */ cpu_backend_i,
...
     ctx->n_buffers = n_buffers;
     ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));

+    GGML_ASSERT(ctx->buffers != NULL);
+
     size_t total_size = 0;
     for (size_t i = 0; i < n_buffers; i++) {
         ctx->buffers[i] = buffers[i];
...
     }
 }

+// creates a copy of the tensor with the same memory layout
+static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
+    struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        dup->nb[i] = tensor->nb[i];
+    }
+    return dup;
+}
+
+static bool ggml_is_view_op(enum ggml_op op) {
+    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
+}

 // scheduler

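ggml_dup_tensor_layout exists because ggml_dup_tensor alone gives the copy a fresh contiguous layout, while the scheduler wants input copies to match the source's strides exactly. A small sketch that shows the difference for a transposed view (the tensor sizes are arbitrary examples, not taken from this commit):

// Sketch: a transposed view has swapped strides; a plain dup does not keep them.
#include "ggml.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 16*1024*1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 8);
    struct ggml_tensor * at = ggml_transpose(ctx, a);   // view with swapped ne/nb
    struct ggml_tensor * d  = ggml_dup_tensor(ctx, at); // fresh, contiguous strides

    printf("at: nb0=%zu nb1=%zu\n", at->nb[0], at->nb[1]); // keeps the view's strides
    printf("d : nb0=%zu nb1=%zu\n", d->nb[0],  d->nb[1]);  // standard contiguous strides

    ggml_free(ctx);
    return 0;
}

Copying nb[] on top of ggml_dup_tensor, as the helper above does, is what makes the copy byte-compatible with the source layout.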
| 924 |
#define GGML_MAX_SPLIT_INPUTS 16
|
| 925 |
|
| 926 |
struct ggml_backend_sched_split {
|
| 927 |
+
int backend_id;
|
| 928 |
int i_start;
|
| 929 |
int i_end;
|
| 930 |
struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
|
|
|
|
| 939 |
int n_backends;
|
| 940 |
ggml_backend_t backends[GGML_MAX_BACKENDS];
|
| 941 |
ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
|
|
|
|
| 942 |
|
| 943 |
ggml_gallocr_t galloc;
|
| 944 |
|
| 945 |
// hash keys of the nodes in the graph
|
| 946 |
struct ggml_hash_set hash_set;
|
| 947 |
+
// hash values
|
| 948 |
+
int * tensor_backend_id;
|
| 949 |
+
struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS];
|
| 950 |
+
|
| 951 |
+
int * node_backend_ids; // [n_nodes]
|
| 952 |
+
int n_nodes;
|
| 953 |
|
| 954 |
// copy of the graph with modified inputs
|
| 955 |
struct ggml_cgraph * graph;
|
|
|
|
| 959 |
|
| 960 |
struct ggml_context * ctx;
|
| 961 |
|
| 962 |
+
ggml_backend_sched_eval_callback callback_eval;
|
| 963 |
+
void * callback_eval_user_data;
|
| 964 |
+
|
| 965 |
// align context_buffer to GGML_MEM_ALIGN
|
| 966 |
#ifdef _MSC_VER
|
| 967 |
__declspec(align(GGML_MEM_ALIGN))
|
| 968 |
#else
|
| 969 |
__attribute__((aligned(GGML_MEM_ALIGN)))
|
| 970 |
#endif
|
| 971 |
+
char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
|
|
|
|
|
|
|
|
|
|
| 972 |
};
|
| 973 |
|
| 974 |
#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
|
| 975 |
+
#define tensor_backend_id(node) sched->tensor_backend_id[hash_id(node)]
|
| 976 |
+
#define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)])
|
|
|
|
|
|
|
|
|
|
| 977 |
|
| 978 |
+
// returns the priority of the backend, lower id is higher priority
|
| 979 |
+
static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
| 980 |
for (int i = 0; i < sched->n_backends; i++) {
|
| 981 |
if (sched->backends[i] == backend) {
|
| 982 |
return i;
|
| 983 |
}
|
| 984 |
}
|
| 985 |
+
return -1;
|
| 986 |
}
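Backend priority in the scheduler is simply the position in the array passed to ggml_backend_sched_new: index 0 is preferred and the CPU backend is expected to come last, which is why the expansion passes below talk about skipping the lowest-priority backend. A hedged usage sketch; my_gpu_backend is a hypothetical placeholder for whatever GPU backend the application initialized, and GGML_DEFAULT_GRAPH_SIZE is used only as a plausible graph size:

// Sketch: building a scheduler with GPU first (highest priority) and CPU last.
#include "ggml.h"
#include "ggml-backend.h"
#include <stddef.h>

static ggml_backend_sched_t make_sched(ggml_backend_t my_gpu_backend) {
    ggml_backend_t cpu = ggml_backend_cpu_init();
    ggml_backend_t backends[2] = { my_gpu_backend, cpu }; // GPU first -> preferred
    // passing NULL for bufts selects each backend's default buffer type
    return ggml_backend_sched_new(backends, NULL, 2, GGML_DEFAULT_GRAPH_SIZE);
}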
|
| 987 |
|
| 988 |
+
static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 989 |
if (buffer == NULL) {
|
| 990 |
+
return -1;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 991 |
}
|
| 992 |
|
| 993 |
// find highest prio backend that supports the buffer type
|
| 994 |
for (int i = 0; i < sched->n_backends; i++) {
|
| 995 |
if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
|
| 996 |
+
return i;
|
| 997 |
}
|
| 998 |
}
|
| 999 |
GGML_ASSERT(false && "tensor buffer type not supported by any backend");
|
| 1000 |
}
|
| 1001 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1002 |
#if 0
|
| 1003 |
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
|
| 1004 |
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
|
|
|
|
| 1009 |
#endif
|
| 1010 |
|
| 1011 |
// returns the backend that should be used for the node based on the current locations
|
| 1012 |
+
static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
|
| 1013 |
+
// TODO: use supports_op to check if the backend supports the op
|
| 1014 |
+
|
| 1015 |
// assign pre-allocated nodes to their backend
|
| 1016 |
// dst
|
| 1017 |
+
int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->buffer);
|
| 1018 |
+
if (cur_backend != -1) {
|
| 1019 |
SET_CAUSE(node, "1.dst");
|
| 1020 |
+
return cur_backend;
|
| 1021 |
}
|
| 1022 |
// view_src
|
| 1023 |
+
if (tensor->view_src != NULL) {
|
| 1024 |
+
cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src->buffer);
|
| 1025 |
+
if (cur_backend != -1) {
|
| 1026 |
SET_CAUSE(node, "1.vsrc");
|
| 1027 |
+
return cur_backend;
|
| 1028 |
}
|
| 1029 |
}
|
| 1030 |
// assign nodes that use weights to the backend of the weights
|
| 1031 |
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
| 1032 |
+
const struct ggml_tensor * src = tensor->src[i];
|
| 1033 |
if (src == NULL) {
|
| 1034 |
break;
|
| 1035 |
}
|
| 1036 |
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
| 1037 |
+
int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
|
| 1038 |
// operations with weights are always run on the same backend as the weights
|
| 1039 |
SET_CAUSE(node, "1.wgt%d", i);
|
| 1040 |
+
return src_backend;
|
| 1041 |
}
|
| 1042 |
}
|
| 1043 |
|
| 1044 |
+
return -1;
|
| 1045 |
}
|
| 1046 |
|
| 1047 |
static char * fmt_size(size_t size) {
|
|
|
|
| 1054 |
return buffer;
|
| 1055 |
}
|
| 1056 |
|
| 1057 |
+
static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 1058 |
int cur_split = 0;
|
| 1059 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1060 |
if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
|
| 1061 |
+
ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
|
| 1062 |
fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
|
| 1063 |
sched->splits[cur_split].n_inputs);
|
| 1064 |
for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
|
|
|
|
| 1072 |
if (ggml_is_view_op(node->op)) {
|
| 1073 |
continue;
|
| 1074 |
}
|
| 1075 |
+
ggml_backend_t tensor_backend = tensor_backend(node);
|
|
|
|
| 1076 |
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
|
| 1077 |
+
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
|
| 1078 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1079 |
struct ggml_tensor * src = node->src[j];
|
| 1080 |
if (src == NULL) {
|
| 1081 |
break;
|
| 1082 |
}
|
| 1083 |
+
ggml_backend_t src_backend = tensor_backend(src);
|
|
|
|
| 1084 |
fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
|
| 1085 |
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
|
| 1086 |
}
|
|
|
|
| 1088 |
}
|
| 1089 |
}
|
| 1090 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1091 |
//#define DEBUG_PASS1
|
| 1092 |
//#define DEBUG_PASS2
|
| 1093 |
//#define DEBUG_PASS3
|
| 1094 |
//#define DEBUG_PASS4
|
| 1095 |
|
| 1096 |
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
| 1097 |
+
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 1098 |
// reset splits
|
| 1099 |
sched->n_splits = 0;
|
| 1100 |
sched->is_reset = false;
|
|
|
|
| 1116 |
// pass 1: assign backends to ops with pre-allocated inputs
|
| 1117 |
for (int i = 0; i < graph->n_leafs; i++) {
|
| 1118 |
struct ggml_tensor * leaf = graph->leafs[i];
|
| 1119 |
+
if (tensor_backend_id(leaf) != -1) {
|
| 1120 |
// do not overwrite user assignments
|
| 1121 |
continue;
|
| 1122 |
}
|
| 1123 |
+
tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf);
|
| 1124 |
}
|
| 1125 |
|
| 1126 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1127 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1128 |
+
if (tensor_backend_id(node) != -1) {
|
| 1129 |
// do not overwrite user assignments
|
| 1130 |
continue;
|
| 1131 |
}
|
| 1132 |
+
tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node);
|
| 1133 |
// src
|
| 1134 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1135 |
struct ggml_tensor * src = node->src[j];
|
| 1136 |
if (src == NULL) {
|
| 1137 |
break;
|
| 1138 |
}
|
| 1139 |
+
if (tensor_backend_id(src) == -1) {
|
| 1140 |
+
tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
|
| 1141 |
}
|
| 1142 |
}
|
| 1143 |
}
|
|
|
|
| 1152 |
|
| 1153 |
// pass 2.1 expand gpu up
|
| 1154 |
{
|
| 1155 |
+
int cur_backend_id = -1;
|
| 1156 |
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
| 1157 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1158 |
if (ggml_is_view_op(node->op)) {
|
| 1159 |
continue;
|
| 1160 |
}
|
| 1161 |
+
int tensor_backend_id = tensor_backend_id(node);
|
| 1162 |
+
if (tensor_backend_id != -1) {
|
| 1163 |
+
if (tensor_backend_id == sched->n_backends - 1) {
|
| 1164 |
// skip cpu (lowest prio backend)
|
| 1165 |
+
cur_backend_id = -1;
|
| 1166 |
} else {
|
| 1167 |
+
cur_backend_id = tensor_backend_id;
|
| 1168 |
}
|
| 1169 |
} else {
|
| 1170 |
+
tensor_backend_id(node) = cur_backend_id;
|
| 1171 |
SET_CAUSE(node, "2.1");
|
| 1172 |
}
|
| 1173 |
}
|
|
|
|
| 1175 |
|
| 1176 |
// pass 2.2 expand gpu down
|
| 1177 |
{
|
| 1178 |
+
int cur_backend_id = -1;
|
| 1179 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1180 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1181 |
if (ggml_is_view_op(node->op)) {
|
| 1182 |
continue;
|
| 1183 |
}
|
| 1184 |
+
int tensor_backend_id = tensor_backend_id(node);
|
| 1185 |
+
if (tensor_backend_id != -1) {
|
| 1186 |
+
if (tensor_backend_id == sched->n_backends - 1) {
|
| 1187 |
// skip cpu (lowest prio backend)
|
| 1188 |
+
cur_backend_id = -1;
|
| 1189 |
} else {
|
| 1190 |
+
cur_backend_id = tensor_backend_id;
|
| 1191 |
}
|
| 1192 |
} else {
|
| 1193 |
+
tensor_backend_id(node) = cur_backend_id;
|
| 1194 |
SET_CAUSE(node, "2.2");
|
| 1195 |
}
|
| 1196 |
}
|
|
|
|
| 1198 |
|
| 1199 |
// pass 2.3 expand rest up
|
| 1200 |
{
|
| 1201 |
+
int cur_backend_id = -1;
|
| 1202 |
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
| 1203 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1204 |
if (ggml_is_view_op(node->op)) {
|
| 1205 |
continue;
|
| 1206 |
}
|
| 1207 |
+
int tensor_backend_id = tensor_backend_id(node);
|
| 1208 |
+
if (tensor_backend_id != -1) {
|
| 1209 |
+
cur_backend_id = tensor_backend_id;
|
| 1210 |
} else {
|
| 1211 |
+
tensor_backend_id(node) = cur_backend_id;
|
| 1212 |
SET_CAUSE(node, "2.3");
|
| 1213 |
}
|
| 1214 |
}
|
|
|
|
| 1216 |
|
| 1217 |
// pass 2.4 expand rest down
|
| 1218 |
{
|
| 1219 |
+
int cur_backend_id = -1;
|
| 1220 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1221 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1222 |
if (ggml_is_view_op(node->op)) {
|
| 1223 |
continue;
|
| 1224 |
}
|
| 1225 |
+
int tensor_backend_id = tensor_backend_id(node);
|
| 1226 |
+
if (tensor_backend_id != -1) {
|
| 1227 |
+
cur_backend_id = tensor_backend_id;
|
| 1228 |
} else {
|
| 1229 |
+
tensor_backend_id(node) = cur_backend_id;
|
| 1230 |
SET_CAUSE(node, "2.4");
|
| 1231 |
}
|
| 1232 |
}
|
|
|
|
| 1238 |
// pass 3: assign backends to remaining src from dst and view_src
|
| 1239 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1240 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1241 |
+
int cur_backend_id = tensor_backend_id(node);
|
| 1242 |
+
if (node->view_src != NULL && cur_backend_id == -1) {
|
| 1243 |
+
cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src);
|
| 1244 |
SET_CAUSE(node, "3.vsrc");
|
| 1245 |
}
|
| 1246 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
|
|
| 1248 |
if (src == NULL) {
|
| 1249 |
break;
|
| 1250 |
}
|
| 1251 |
+
int src_backend_id = tensor_backend_id(src);
|
| 1252 |
+
if (src_backend_id == -1) {
|
| 1253 |
if (src->view_src != NULL) {
|
| 1254 |
// views are always on the same backend as the source
|
| 1255 |
+
tensor_backend_id(src) = tensor_backend_id(src->view_src);
|
| 1256 |
SET_CAUSE(src, "3.vsrc");
|
| 1257 |
} else {
|
| 1258 |
+
tensor_backend_id(src) = cur_backend_id;
|
| 1259 |
SET_CAUSE(src, "3.cur");
|
| 1260 |
}
|
| 1261 |
}
|
|
|
|
| 1272 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1273 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1274 |
if (!ggml_is_view_op(node->op)) {
|
| 1275 |
+
sched->splits[0].backend_id = tensor_backend_id(node);
|
| 1276 |
break;
|
| 1277 |
}
|
| 1278 |
}
|
| 1279 |
sched->splits[0].i_start = 0;
|
| 1280 |
sched->splits[0].n_inputs = 0;
|
| 1281 |
memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
|
| 1282 |
+
int cur_backend_id = sched->splits[0].backend_id;
|
|
|
|
| 1283 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1284 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1285 |
|
|
|
|
| 1287 |
continue;
|
| 1288 |
}
|
| 1289 |
|
| 1290 |
+
int tensor_backend_id = tensor_backend_id(node);
|
| 1291 |
|
| 1292 |
+
GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now
|
| 1293 |
|
| 1294 |
+
if (tensor_backend_id != cur_backend_id) {
|
| 1295 |
sched->splits[cur_split].i_end = i;
|
| 1296 |
cur_split++;
|
| 1297 |
GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
|
| 1298 |
+
sched->splits[cur_split].backend_id = tensor_backend_id;
|
| 1299 |
sched->splits[cur_split].i_start = i;
|
| 1300 |
sched->splits[cur_split].n_inputs = 0;
|
| 1301 |
+
cur_backend_id = tensor_backend_id;
|
|
|
|
| 1302 |
}
|
| 1303 |
|
| 1304 |
// find inputs that are not on the same backend
|
|
|
|
| 1307 |
if (src == NULL) {
|
| 1308 |
break;
|
| 1309 |
}
|
| 1310 |
+
int src_backend_id = tensor_backend_id(src);
|
| 1311 |
+
assert(src_backend_id != -1); // all inputs should be assigned by now
|
| 1312 |
+
if (src_backend_id != tensor_backend_id) {
|
| 1313 |
// create a copy of the input in the split's backend
|
| 1314 |
size_t id = hash_id(src);
|
| 1315 |
+
if (sched->tensor_copies[id][cur_backend_id] == NULL) {
|
| 1316 |
+
ggml_backend_t backend = sched->backends[cur_backend_id];
|
| 1317 |
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
| 1318 |
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
|
| 1319 |
|
| 1320 |
+
sched->tensor_copies[id][cur_backend_id] = tensor_copy;
|
| 1321 |
+
tensor_backend_id(tensor_copy) = cur_backend_id;
|
| 1322 |
SET_CAUSE(tensor_copy, "4.cpy");
|
| 1323 |
|
| 1324 |
int n_inputs = sched->splits[cur_split].n_inputs++;
|
| 1325 |
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
|
| 1326 |
sched->splits[cur_split].inputs[n_inputs] = src;
|
| 1327 |
}
|
| 1328 |
+
node->src[j] = sched->tensor_copies[id][cur_backend_id];
|
|
|
|
|
| 1329 |
}
|
| 1330 |
}
|
| 1331 |
}
|
|
|
|
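The pass above cuts the node list into splits: a split ends whenever the next non-view node is assigned to a different backend, and any source that lives on another backend is recorded as a split input and replaced by a lazily created copy on the split's backend. A small self-contained sketch of just the boundary rule, on a plain array of backend ids (helper name is ours):

static int count_splits(const int * backend_ids, int n_nodes) {
    if (n_nodes == 0) {
        return 0;
    }
    int n_splits = 1;
    for (int i = 1; i < n_nodes; i++) {
        if (backend_ids[i] != backend_ids[i - 1]) {
            n_splits++; // a change of backend between consecutive nodes starts a new split
        }
    }
    return n_splits;
}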
| 1340 |
// sanity check: all sources should have the same backend as the node
|
| 1341 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1342 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1343 |
+
ggml_backend_t tensor_backend = tensor_backend(node);
|
| 1344 |
+
if (tensor_backend == NULL) {
|
| 1345 |
fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
|
| 1346 |
}
|
| 1347 |
+
if (node->view_src != NULL && tensor_backend != tensor_backend(node->view_src)) {
|
| 1348 |
fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
|
| 1349 |
+
node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
|
| 1350 |
+
node->view_src->name, tensor_backend(node->view_src) ? ggml_backend_name(tensor_backend(node->view_src)) : "NULL");
|
| 1351 |
}
|
| 1352 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1353 |
struct ggml_tensor * src = node->src[j];
|
| 1354 |
if (src == NULL) {
|
| 1355 |
break;
|
| 1356 |
}
|
| 1357 |
+
ggml_backend_t src_backend = tensor_backend(src);
|
| 1358 |
+
if (src_backend != tensor_backend /* && src_backend != NULL */) {
|
| 1359 |
fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
|
| 1360 |
+
node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
|
| 1361 |
+
j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
|
| 1362 |
}
|
| 1363 |
+
if (src->view_src != NULL && src_backend != tensor_backend(src->view_src)) {
|
| 1364 |
fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
|
| 1365 |
+
src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
|
| 1366 |
+
src->view_src->name, tensor_backend(src->view_src) ? ggml_backend_name(tensor_backend(src->view_src)) : "NULL");
|
| 1367 |
}
|
| 1368 |
}
|
| 1369 |
}
|
|
|
|
| 1377 |
struct ggml_backend_sched_split * split = &sched->splits[i];
|
| 1378 |
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
|
| 1379 |
|
|
|
|
| 1380 |
for (int j = 0; j < split->n_inputs; j++) {
|
| 1381 |
struct ggml_tensor * input = split->inputs[j];
|
| 1382 |
+
struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id];
|
| 1383 |
+
|
| 1384 |
// add a dependency to the input source so that it is not freed before the copy is done
|
| 1385 |
+
struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
|
| 1386 |
+
sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
|
| 1387 |
+
graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
|
| 1388 |
+
|
| 1389 |
+
// add a dependency to the input copy so that it is allocated at the start of the split
|
| 1390 |
+
sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
|
| 1391 |
graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
|
| 1392 |
}
|
| 1393 |
|
| 1394 |
for (int j = split->i_start; j < split->i_end; j++) {
|
| 1395 |
+
sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
|
| 1396 |
graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
|
| 1397 |
}
|
| 1398 |
}
|
| 1399 |
sched->graph = graph_copy;
|
| 1400 |
}
|
| 1401 |
|
| 1402 |
+
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
| 1403 |
+
// ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
|
| 1404 |
+
if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
|
| 1405 |
+
#ifndef NDEBUG
|
| 1406 |
+
fprintf(stderr, "ggml_backend_sched: failed to allocate graph, reserving\n");
|
| 1407 |
+
#endif
|
| 1408 |
+
ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
|
| 1409 |
+
if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
|
| 1410 |
+
fprintf(stderr, "ggml_backend_sched: failed to allocate graph\n");
|
| 1411 |
+
return false;
|
| 1412 |
+
}
|
| 1413 |
+
}
|
| 1414 |
}
|
| 1415 |
|
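ggml_backend_sched_alloc_splits first tries to place the split graph into the buffers sized by the last reserve; only if that fails (for example because the new graph needs more memory than the measured one) does it re-reserve and retry once. The same try-once-then-grow-and-retry shape in isolation, with hypothetical helpers standing in for ggml_gallocr_alloc_graph and ggml_gallocr_reserve_n:

static bool try_alloc(void);    // attempt allocation with the current buffer sizes
static void grow_buffers(void); // re-measure the graph and enlarge the buffers

static bool alloc_with_retry(void) {
    if (!try_alloc()) {
        grow_buffers();         // resize once, based on the graph that just failed
        if (!try_alloc()) {
            return false;       // still failing: report the error to the caller
        }
    }
    return true;
}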
| 1416 |
+
static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
|
| 1417 |
uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
|
| 1418 |
uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
|
| 1419 |
|
|
|
|
| 1421 |
|
| 1422 |
for (int i = 0; i < sched->n_splits; i++) {
|
| 1423 |
struct ggml_backend_sched_split * split = &splits[i];
|
| 1424 |
+
int split_backend_id = split->backend_id;
|
| 1425 |
+
ggml_backend_t split_backend = sched->backends[split_backend_id];
|
| 1426 |
|
| 1427 |
// copy the input tensors to the split backend
|
| 1428 |
uint64_t copy_start_us = ggml_time_us();
|
| 1429 |
for (int j = 0; j < split->n_inputs; j++) {
|
| 1430 |
struct ggml_tensor * input = split->inputs[j];
|
| 1431 |
+
struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id];
|
| 1432 |
|
| 1433 |
GGML_ASSERT(input->buffer != NULL);
|
| 1434 |
GGML_ASSERT(input_cpy->buffer != NULL);
|
| 1435 |
|
|
|
|
|
|
|
| 1436 |
ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
|
| 1437 |
}
|
| 1438 |
//ggml_backend_synchronize(split_backend); // necessary to measure copy time
|
|
|
|
| 1448 |
|
| 1449 |
uint64_t compute_start_us = ggml_time_us();
|
| 1450 |
if (!sched->callback_eval) {
|
| 1451 |
+
if (!ggml_backend_graph_compute(split_backend, &split->graph)) {
|
| 1452 |
+
return false;
|
| 1453 |
+
}
|
| 1454 |
//ggml_backend_synchronize(split_backend); // necessary to measure compute time
|
| 1455 |
} else {
|
| 1456 |
// similar to ggml_backend_compare_graph_backend
|
|
|
|
| 1470 |
|
| 1471 |
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
|
| 1472 |
|
| 1473 |
+
if (!ggml_backend_graph_compute(split_backend, &gv)) {
|
| 1474 |
+
return false;
|
| 1475 |
+
}
|
| 1476 |
|
| 1477 |
if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
|
| 1478 |
break;
|
|
|
|
| 1494 |
}
|
| 1495 |
}
|
| 1496 |
#endif
|
|
|
|
|
| 1497 |
|
| 1498 |
+
return true;
|
| 1499 |
}
|
| 1500 |
|
| 1501 |
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
|
|
|
|
| 1505 |
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
|
| 1506 |
|
| 1507 |
// initialize hash table
|
| 1508 |
+
sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
|
| 1509 |
+
sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
|
| 1510 |
+
sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
|
| 1511 |
+
sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
|
| 1512 |
|
| 1513 |
sched->n_backends = n_backends;
|
| 1514 |
for (int i = 0; i < n_backends; i++) {
|
|
|
|
| 1516 |
sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
|
| 1517 |
}
|
| 1518 |
|
| 1519 |
+
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
|
| 1520 |
|
| 1521 |
+
ggml_backend_sched_reset(sched);
|
|
|
|
|
| 1522 |
|
| 1523 |
return sched;
|
| 1524 |
}
|
|
|
|
| 1527 |
if (sched == NULL) {
|
| 1528 |
return;
|
| 1529 |
}
|
|
|
|
|
|
|
|
|
|
| 1530 |
ggml_gallocr_free(sched->galloc);
|
| 1531 |
ggml_free(sched->ctx);
|
| 1532 |
free(sched->hash_set.keys);
|
| 1533 |
+
free(sched->tensor_backend_id);
|
| 1534 |
+
free(sched->tensor_copies);
|
| 1535 |
+
free(sched->node_backend_ids);
|
| 1536 |
free(sched);
|
| 1537 |
}
|
| 1538 |
|
| 1539 |
+
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
| 1540 |
+
// reset state for the next run
|
| 1541 |
+
size_t hash_size = sched->hash_set.size;
|
| 1542 |
+
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
|
| 1543 |
+
memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
|
| 1544 |
+
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
|
| 1545 |
|
| 1546 |
+
sched->is_reset = true;
|
| 1547 |
+
}
|
| 1548 |
|
| 1549 |
+
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
| 1550 |
+
ggml_backend_sched_split_graph(sched, measure_graph);
|
| 1551 |
+
|
| 1552 |
+
if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids)) {
|
| 1553 |
+
return false;
|
| 1554 |
}
|
| 1555 |
|
| 1556 |
+
ggml_backend_sched_reset(sched);
|
| 1557 |
+
return true;
|
| 1558 |
}
|
| 1559 |
|
| 1560 |
+
bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 1561 |
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
|
| 1562 |
|
| 1563 |
if (!sched->is_reset) {
|
| 1564 |
+
ggml_backend_sched_reset(sched);
|
| 1565 |
}
|
| 1566 |
|
| 1567 |
+
ggml_backend_sched_split_graph(sched, graph);
|
| 1568 |
+
if (!ggml_backend_sched_alloc_splits(sched)) {
|
| 1569 |
+
return false;
|
| 1570 |
+
}
|
| 1571 |
|
| 1572 |
+
if (!ggml_backend_sched_compute_splits(sched)) {
|
| 1573 |
+
return false;
|
| 1574 |
+
}
|
| 1575 |
|
| 1576 |
+
return true;
|
| 1577 |
+
}
|
| 1578 |
|
| 1579 |
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
|
| 1580 |
sched->callback_eval = callback;
|
|
|
|
| 1585 |
return sched->n_splits;
|
| 1586 |
}
|
| 1587 |
|
| 1588 |
+
size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
| 1589 |
+
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
|
|
|
| 1590 |
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
| 1591 |
+
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
| 1592 |
}
|
| 1593 |
|
| 1594 |
void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
| 1595 |
+
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
| 1596 |
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
| 1597 |
+
tensor_backend_id(node) = backend_index;
|
| 1598 |
}
|
| 1599 |
|
| 1600 |
ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
| 1601 |
+
int backend_index = tensor_backend_id(node);
|
| 1602 |
+
if (backend_index == -1) {
|
| 1603 |
return NULL;
|
| 1604 |
}
|
| 1605 |
+
return sched->backends[backend_index];
|
| 1606 |
}
|
| 1607 |
|
| 1608 |
// utils
|
| 1609 |
|
| 1610 |
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
| 1611 |
GGML_ASSERT(tensor->buffer == NULL);
|
|
|
|
| 1612 |
GGML_ASSERT(tensor->view_src != NULL);
|
| 1613 |
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
| 1614 |
GGML_ASSERT(tensor->view_src->data != NULL);
|
|
|
|
| 1632 |
ggml_backend_buffer_init_tensor(buffer, tensor);
|
| 1633 |
}
|
| 1634 |
|
| 1635 |
+
static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
|
| 1636 |
struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
|
| 1637 |
|
| 1638 |
GGML_ASSERT(src != NULL);
|
|
|
|
| 1645 |
|
| 1646 |
struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
|
| 1647 |
if (src->view_src != NULL) {
|
| 1648 |
+
dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
|
| 1649 |
dst->view_offs = src->view_offs;
|
| 1650 |
}
|
| 1651 |
dst->op = src->op;
|
|
|
|
| 1658 |
if (s == NULL) {
|
| 1659 |
break;
|
| 1660 |
}
|
| 1661 |
+
dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
|
| 1662 |
}
|
| 1663 |
|
| 1664 |
node_copies[id] = dst;
|
| 1665 |
return dst;
|
| 1666 |
}
|
| 1667 |
|
| 1668 |
+
static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
|
| 1669 |
size_t id = ggml_hash_find(hash_set, src);
|
| 1670 |
if (node_init[id]) {
|
| 1671 |
return;
|
|
|
|
| 1674 |
|
| 1675 |
struct ggml_tensor * dst = node_copies[id];
|
| 1676 |
if (dst->view_src != NULL) {
|
| 1677 |
+
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
|
| 1678 |
ggml_backend_view_init(dst->view_src->buffer, dst);
|
| 1679 |
}
|
| 1680 |
else {
|
|
|
|
| 1687 |
if (s == NULL) {
|
| 1688 |
break;
|
| 1689 |
}
|
| 1690 |
+
graph_copy_init_tensor(hash_set, node_copies, node_init, s);
|
| 1691 |
}
|
| 1692 |
}
|
| 1693 |
|
| 1694 |
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
|
| 1695 |
struct ggml_hash_set hash_set = {
|
| 1696 |
/* .size = */ graph->visited_hash_table.size,
|
| 1697 |
+
/* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
|
| 1698 |
};
|
| 1699 |
+
struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
|
| 1700 |
+
bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
|
| 1701 |
|
| 1702 |
struct ggml_init_params params = {
|
| 1703 |
/* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
|
|
|
|
| 1726 |
// dup nodes
|
| 1727 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1728 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1729 |
+
graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
|
| 1730 |
}
|
| 1731 |
|
| 1732 |
// allocate nodes
|
|
|
|
| 1751 |
// copy data and init views
|
| 1752 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1753 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1754 |
+
graph_copy_init_tensor(hash_set, node_copies, node_init, node);
|
| 1755 |
}
|
| 1756 |
|
| 1757 |
// build graph copy
|
ggml-backend.h
CHANGED
|
@@ -130,11 +130,7 @@ extern "C" {
|
|
| 130 |
|
| 131 |
// in build_graph:
|
| 132 |
build_graph(...) {
|
| 133 |
-
//
|
| 134 |
-
alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
|
| 135 |
-
ggml_allocr_alloc(alloc_cpu, tensor);
|
| 136 |
-
|
| 137 |
-
// manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
|
| 138 |
struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
|
| 139 |
ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
|
| 140 |
}
|
|
@@ -164,20 +160,19 @@ extern "C" {
|
|
| 164 |
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
|
| 165 |
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
| 166 |
// Initialize backend buffers from a measure graph
|
| 167 |
-
GGML_API
|
| 168 |
// Get the number of splits of the last graph
|
| 169 |
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
|
| 170 |
|
| 171 |
-
GGML_API
|
| 172 |
-
GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
|
| 173 |
|
| 174 |
GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
| 175 |
GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
| 176 |
|
| 177 |
// Allocate and compute graph on the backend scheduler
|
| 178 |
-
GGML_API
|
| 179 |
|
| 180 |
-
// Reset all assignments and allocators - must be called before
|
| 181 |
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
|
| 182 |
|
| 183 |
// Set a callback to be called for each resulting node during graph compute
|
|
|
|
| 130 |
|
| 131 |
// in build_graph:
|
| 132 |
build_graph(...) {
|
| 133 |
+
// manually assign nodes to a backend (optional, should not be needed in most cases)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
|
| 135 |
ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
|
| 136 |
}
|
|
|
|
| 160 |
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
|
| 161 |
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
| 162 |
// Initialize backend buffers from a measure graph
|
| 163 |
+
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
|
| 164 |
// Get the number of splits of the last graph
|
| 165 |
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
|
| 166 |
|
| 167 |
+
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
|
|
|
| 168 |
|
| 169 |
GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
| 170 |
GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
| 171 |
|
| 172 |
// Allocate and compute graph on the backend scheduler
|
| 173 |
+
GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
| 174 |
|
| 175 |
+
// Reset all assignments and allocators - must be called before changing the node backends
|
| 176 |
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
|
| 177 |
|
| 178 |
// Set a callback to be called for each resulting node during graph compute
|
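Put together, the new header boils down to a reserve/compute pair plus a size query. A hedged end-to-end sketch of the intended call sequence (build_graph() is a placeholder for application code that builds the graph in a no_alloc context; backends are passed in priority order, with the CPU backend last):

#include <stdio.h>
#include "ggml.h"
#include "ggml-backend.h"

// placeholder: application code that builds the compute graph in a no_alloc context
extern struct ggml_cgraph * build_graph(struct ggml_context * ctx);

static bool run_with_sched(ggml_backend_t backend_gpu, ggml_backend_t backend_cpu, struct ggml_context * ctx) {
    ggml_backend_t backends[2] = { backend_gpu, backend_cpu }; // CPU last = lowest priority

    // NULL bufts -> each backend uses its default buffer type
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, GGML_DEFAULT_GRAPH_SIZE);

    // size the compute buffers once from a worst-case measure graph
    if (!ggml_backend_sched_reserve(sched, build_graph(ctx))) {
        ggml_backend_sched_free(sched);
        return false;
    }
    fprintf(stderr, "compute buffer (gpu): %zu bytes\n",
            ggml_backend_sched_get_buffer_size(sched, backend_gpu));

    // per iteration: build the real graph and let the scheduler split, allocate and run it
    bool ok = ggml_backend_sched_graph_compute(sched, build_graph(ctx));

    ggml_backend_sched_free(sched);
    return ok;
}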
ggml.c
CHANGED
|
@@ -2607,7 +2607,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
|
| 2607 |
/*.nb =*/ { 0, 0, 0, 0 },
|
| 2608 |
/*.op =*/ GGML_OP_NONE,
|
| 2609 |
/*.op_params =*/ { 0 },
|
| 2610 |
-
/*.
|
| 2611 |
/*.grad =*/ NULL,
|
| 2612 |
/*.src =*/ { NULL },
|
| 2613 |
/*.perf_runs =*/ 0,
|
|
@@ -6509,7 +6509,7 @@ struct ggml_tensor * ggml_cross_entropy_loss_back(
|
|
| 6509 |
void ggml_set_param(
|
| 6510 |
struct ggml_context * ctx,
|
| 6511 |
struct ggml_tensor * tensor) {
|
| 6512 |
-
tensor->
|
| 6513 |
|
| 6514 |
GGML_ASSERT(tensor->grad == NULL);
|
| 6515 |
tensor->grad = ggml_dup_tensor(ctx, tensor);
|
|
@@ -15311,7 +15311,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
|
|
| 15311 |
return NULL;
|
| 15312 |
}
|
| 15313 |
|
| 15314 |
-
if (node->
|
| 15315 |
return node;
|
| 15316 |
}
|
| 15317 |
|
|
@@ -15345,7 +15345,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
|
|
| 15345 |
|
| 15346 |
clone->op = node->op;
|
| 15347 |
clone->grad = node->grad;
|
| 15348 |
-
clone->
|
| 15349 |
clone->extra = node->extra;
|
| 15350 |
for (int k = 0; k < GGML_MAX_DIMS; ++k) {
|
| 15351 |
clone->nb[k] = node->nb[k];
|
|
@@ -16377,7 +16377,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
|
|
| 16377 |
for (int i = 0; i < gf->n_nodes; i++) {
|
| 16378 |
struct ggml_tensor * node = gf->nodes[i];
|
| 16379 |
|
| 16380 |
-
if (node->
|
| 16381 |
GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
|
| 16382 |
ggml_build_forward_expand(gb, node->grad);
|
| 16383 |
}
|
|
@@ -17862,7 +17862,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
|
|
| 17862 |
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
|
| 17863 |
i,
|
| 17864 |
node->ne[0], node->ne[1], node->ne[2],
|
| 17865 |
-
ggml_op_name(node->op), node->
|
| 17866 |
(double) node->perf_cycles / (double) ggml_cycles_per_ms(),
|
| 17867 |
(double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
|
| 17868 |
(double) node->perf_time_us / 1000.0,
|
|
@@ -17955,7 +17955,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|
| 17955 |
continue;
|
| 17956 |
}
|
| 17957 |
|
| 17958 |
-
if (node->
|
| 17959 |
snprintf(color, sizeof(color), "yellow");
|
| 17960 |
} else if (node->grad) {
|
| 17961 |
if (ggml_graph_find(gf, node)) {
|
|
@@ -18129,7 +18129,7 @@ static enum ggml_opt_result ggml_opt_adam(
|
|
| 18129 |
int np = 0;
|
| 18130 |
int64_t nx = 0;
|
| 18131 |
for (int i = 0; i < gf->n_nodes; ++i) {
|
| 18132 |
-
if (gf->nodes[i]->
|
| 18133 |
GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
|
| 18134 |
|
| 18135 |
GGML_ASSERT(np < GGML_MAX_PARAMS);
|
|
@@ -18492,7 +18492,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
| 18492 |
int np = 0;
|
| 18493 |
int nx = 0;
|
| 18494 |
for (int i = 0; i < gf->n_nodes; ++i) {
|
| 18495 |
-
if (gf->nodes[i]->
|
| 18496 |
GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
|
| 18497 |
|
| 18498 |
GGML_ASSERT(np < GGML_MAX_PARAMS);
|
|
@@ -18967,6 +18967,16 @@ enum ggml_opt_result ggml_opt_resume_g(
|
|
| 18967 |
|
| 18968 |
////////////////////////////////////////////////////////////////////////////////
|
| 18969 |
|
|
|
|
|
|
| 18970 |
void ggml_quantize_init(enum ggml_type type) {
|
| 18971 |
ggml_critical_section_start();
|
| 18972 |
|
|
|
|
| 2607 |
/*.nb =*/ { 0, 0, 0, 0 },
|
| 2608 |
/*.op =*/ GGML_OP_NONE,
|
| 2609 |
/*.op_params =*/ { 0 },
|
| 2610 |
+
/*.flags =*/ 0,
|
| 2611 |
/*.grad =*/ NULL,
|
| 2612 |
/*.src =*/ { NULL },
|
| 2613 |
/*.perf_runs =*/ 0,
|
|
|
|
| 6509 |
void ggml_set_param(
|
| 6510 |
struct ggml_context * ctx,
|
| 6511 |
struct ggml_tensor * tensor) {
|
| 6512 |
+
tensor->flags |= GGML_TENSOR_FLAG_PARAM;
|
| 6513 |
|
| 6514 |
GGML_ASSERT(tensor->grad == NULL);
|
| 6515 |
tensor->grad = ggml_dup_tensor(ctx, tensor);
|
|
|
|
| 15311 |
return NULL;
|
| 15312 |
}
|
| 15313 |
|
| 15314 |
+
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
|
| 15315 |
return node;
|
| 15316 |
}
|
| 15317 |
|
|
|
|
| 15345 |
|
| 15346 |
clone->op = node->op;
|
| 15347 |
clone->grad = node->grad;
|
| 15348 |
+
clone->flags = node->flags;
|
| 15349 |
clone->extra = node->extra;
|
| 15350 |
for (int k = 0; k < GGML_MAX_DIMS; ++k) {
|
| 15351 |
clone->nb[k] = node->nb[k];
|
|
|
|
| 16377 |
for (int i = 0; i < gf->n_nodes; i++) {
|
| 16378 |
struct ggml_tensor * node = gf->nodes[i];
|
| 16379 |
|
| 16380 |
+
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
|
| 16381 |
GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
|
| 16382 |
ggml_build_forward_expand(gb, node->grad);
|
| 16383 |
}
|
|
|
|
| 17862 |
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
|
| 17863 |
i,
|
| 17864 |
node->ne[0], node->ne[1], node->ne[2],
|
| 17865 |
+
ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ", node->perf_runs,
|
| 17866 |
(double) node->perf_cycles / (double) ggml_cycles_per_ms(),
|
| 17867 |
(double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
|
| 17868 |
(double) node->perf_time_us / 1000.0,
|
|
|
|
| 17955 |
continue;
|
| 17956 |
}
|
| 17957 |
|
| 17958 |
+
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
|
| 17959 |
snprintf(color, sizeof(color), "yellow");
|
| 17960 |
} else if (node->grad) {
|
| 17961 |
if (ggml_graph_find(gf, node)) {
|
|
|
|
| 18129 |
int np = 0;
|
| 18130 |
int64_t nx = 0;
|
| 18131 |
for (int i = 0; i < gf->n_nodes; ++i) {
|
| 18132 |
+
if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
|
| 18133 |
GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
|
| 18134 |
|
| 18135 |
GGML_ASSERT(np < GGML_MAX_PARAMS);
|
|
|
|
| 18492 |
int np = 0;
|
| 18493 |
int nx = 0;
|
| 18494 |
for (int i = 0; i < gf->n_nodes; ++i) {
|
| 18495 |
+
if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
|
| 18496 |
GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
|
| 18497 |
|
| 18498 |
GGML_ASSERT(np < GGML_MAX_PARAMS);
|
|
|
|
| 18967 |
|
| 18968 |
////////////////////////////////////////////////////////////////////////////////
|
| 18969 |
|
| 18970 |
+
void ggml_set_input(struct ggml_tensor * tensor) {
|
| 18971 |
+
tensor->flags |= GGML_TENSOR_FLAG_INPUT;
|
| 18972 |
+
}
|
| 18973 |
+
|
| 18974 |
+
void ggml_set_output(struct ggml_tensor * tensor) {
|
| 18975 |
+
tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
|
| 18976 |
+
}
|
| 18977 |
+
|
| 18978 |
+
////////////////////////////////////////////////////////////////////////////////
|
| 18979 |
+
|
| 18980 |
void ggml_quantize_init(enum ggml_type type) {
|
| 18981 |
ggml_critical_section_start();
|
| 18982 |
|
ggml.h
CHANGED
|
@@ -505,11 +505,17 @@ extern "C" {
|
|
| 505 |
|
| 506 |
enum ggml_log_level {
|
| 507 |
GGML_LOG_LEVEL_ERROR = 2,
|
| 508 |
-
GGML_LOG_LEVEL_WARN
|
| 509 |
-
GGML_LOG_LEVEL_INFO
|
| 510 |
GGML_LOG_LEVEL_DEBUG = 5
|
| 511 |
};
|
| 512 |
|
|
|
|
|
|
|
| 513 |
// ggml object
|
| 514 |
struct ggml_object {
|
| 515 |
size_t offs;
|
|
@@ -543,7 +549,7 @@ extern "C" {
|
|
| 543 |
// op params - allocated as int32_t for alignment
|
| 544 |
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
|
| 545 |
|
| 546 |
-
|
| 547 |
|
| 548 |
struct ggml_tensor * grad;
|
| 549 |
struct ggml_tensor * src[GGML_MAX_SRC];
|
|
@@ -2092,6 +2098,12 @@ extern "C" {
|
|
| 2092 |
ggml_opt_callback callback,
|
| 2093 |
void * callback_data);
|
| 2094 |
|
|
|
|
|
|
| 2095 |
//
|
| 2096 |
// quantization
|
| 2097 |
//
|
|
|
|
| 505 |
|
| 506 |
enum ggml_log_level {
|
| 507 |
GGML_LOG_LEVEL_ERROR = 2,
|
| 508 |
+
GGML_LOG_LEVEL_WARN = 3,
|
| 509 |
+
GGML_LOG_LEVEL_INFO = 4,
|
| 510 |
GGML_LOG_LEVEL_DEBUG = 5
|
| 511 |
};
|
| 512 |
|
| 513 |
+
enum ggml_tensor_flag {
|
| 514 |
+
GGML_TENSOR_FLAG_INPUT = 1,
|
| 515 |
+
GGML_TENSOR_FLAG_OUTPUT = 2,
|
| 516 |
+
GGML_TENSOR_FLAG_PARAM = 4,
|
| 517 |
+
};
|
| 518 |
+
|
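The flag values are powers of two on purpose: they are combined into the new int32_t flags field of ggml_tensor, so tests use a bitwise AND and updates a bitwise OR, exactly as the ggml.c hunks above do. A one-helper illustration (helper name is ours):

static bool ggml_tensor_is_graph_input(const struct ggml_tensor * t) {
    return (t->flags & GGML_TENSOR_FLAG_INPUT) != 0;
}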
| 519 |
// ggml object
|
| 520 |
struct ggml_object {
|
| 521 |
size_t offs;
|
|
|
|
| 549 |
// op params - allocated as int32_t for alignment
|
| 550 |
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
|
| 551 |
|
| 552 |
+
int32_t flags;
|
| 553 |
|
| 554 |
struct ggml_tensor * grad;
|
| 555 |
struct ggml_tensor * src[GGML_MAX_SRC];
|
|
|
|
| 2098 |
ggml_opt_callback callback,
|
| 2099 |
void * callback_data);
|
| 2100 |
|
| 2101 |
+
//
|
| 2102 |
+
// tensor flags
|
| 2103 |
+
//
|
| 2104 |
+
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
|
| 2105 |
+
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
|
| 2106 |
+
|
| 2107 |
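These setters are meant to be called while building a graph, so that allocators and the backend scheduler know which tensors will be written from the outside and which results must survive until they are read back. A minimal sketch, assuming a no_alloc context ctx, a prebuilt graph gf and a weight tensor w:

static struct ggml_tensor * build_proj(struct ggml_context * ctx, struct ggml_cgraph * gf,
                                       struct ggml_tensor * w, int64_t n) {
    struct ggml_tensor * inp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n);
    ggml_set_name (inp, "inp");
    ggml_set_input(inp);   // filled later with ggml_backend_tensor_set

    struct ggml_tensor * out = ggml_mul_mat(ctx, w, inp);
    ggml_set_name  (out, "out");
    ggml_set_output(out);  // result is read back after the graph is computed

    ggml_build_forward_expand(gf, out);
    return out;
}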
//
|
| 2108 |
// quantization
|
| 2109 |
//
|
whisper.cpp
CHANGED
|
@@ -471,52 +471,32 @@ struct whisper_pair {
|
|
| 471 |
|
| 472 |
// ggml_allocr wrapper for whisper usage
|
| 473 |
struct whisper_allocr {
|
| 474 |
-
|
| 475 |
|
| 476 |
std::vector<uint8_t> meta;
|
| 477 |
-
|
| 478 |
-
ggml_backend_buffer_t buffer;
|
| 479 |
};
|
| 480 |
|
| 481 |
static size_t whisper_allocr_size(struct whisper_allocr & allocr) {
|
| 482 |
-
return allocr.meta.size() +
|
| 483 |
}
|
| 484 |
|
| 485 |
// measure the memory usage of a graph and prepare the allocr's internal data buffer
|
| 486 |
-
static
|
| 487 |
auto & alloc = allocr.alloc;
|
| 488 |
auto & meta = allocr.meta;
|
| 489 |
|
| 490 |
-
alloc =
|
| 491 |
|
| 492 |
meta.resize(ggml_tensor_overhead()*WHISPER_MAX_NODES + ggml_graph_overhead());
|
| 493 |
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
return;
|
| 501 |
-
}
|
| 502 |
-
|
| 503 |
-
auto & alloc = allocr.alloc;
|
| 504 |
-
auto & buffer = allocr.buffer;
|
| 505 |
-
|
| 506 |
-
size_t size = ggml_allocr_max_size(alloc);
|
| 507 |
-
|
| 508 |
-
ggml_allocr_free(alloc);
|
| 509 |
-
|
| 510 |
-
buffer = ggml_backend_alloc_buffer(backend, size);
|
| 511 |
-
alloc = ggml_allocr_new_from_buffer(buffer);
|
| 512 |
-
}
|
| 513 |
-
|
| 514 |
-
static void whisper_allocr_free(struct whisper_allocr & allocr) {
|
| 515 |
-
if (allocr.alloc) {
|
| 516 |
-
ggml_allocr_free(allocr.alloc);
|
| 517 |
-
ggml_backend_buffer_free(allocr.buffer);
|
| 518 |
-
allocr.alloc = nullptr;
|
| 519 |
}
|
|
|
|
| 520 |
}
|
| 521 |
|
| 522 |
// medium
|
|
@@ -658,9 +638,9 @@ struct whisper_kv_cache {
|
|
| 658 |
struct ggml_tensor * k;
|
| 659 |
struct ggml_tensor * v;
|
| 660 |
|
| 661 |
-
struct ggml_context * ctx;
|
| 662 |
|
| 663 |
-
ggml_backend_buffer_t buffer;
|
| 664 |
};
|
| 665 |
|
| 666 |
struct whisper_model {
|
|
@@ -698,10 +678,10 @@ struct whisper_model {
|
|
| 698 |
std::vector<whisper_layer_decoder> layers_decoder;
|
| 699 |
|
| 700 |
// ggml context that contains all the meta information about the model tensors
|
| 701 |
-
struct ggml_context * ctx;
|
| 702 |
|
| 703 |
// the model backend data is read-only and can be shared between processors
|
| 704 |
-
|
| 705 |
|
| 706 |
// tensors
|
| 707 |
int n_loaded;
|
|
@@ -903,36 +883,26 @@ static bool kv_cache_init(
|
|
| 903 |
cache.ctx = ggml_init(params);
|
| 904 |
|
| 905 |
if (!cache.ctx) {
|
| 906 |
-
WHISPER_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
|
| 907 |
return false;
|
| 908 |
}
|
| 909 |
|
| 910 |
cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
|
| 911 |
cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
|
| 912 |
|
| 913 |
-
|
| 914 |
-
|
| 915 |
-
|
| 916 |
-
|
| 917 |
-
// allocate the tensors into the backend buffer
|
| 918 |
-
{
|
| 919 |
-
ggml_allocr * alloc = ggml_allocr_new_from_buffer(cache.buffer);
|
| 920 |
-
|
| 921 |
-
ggml_allocr_alloc(alloc, cache.k);
|
| 922 |
-
ggml_allocr_alloc(alloc, cache.v);
|
| 923 |
-
|
| 924 |
-
ggml_allocr_free(alloc);
|
| 925 |
}
|
| 926 |
|
| 927 |
return true;
|
| 928 |
}
|
| 929 |
|
| 930 |
static void kv_cache_free(struct whisper_kv_cache & cache) {
|
| 931 |
-
|
| 932 |
-
|
| 933 |
-
|
| 934 |
-
cache.ctx = nullptr;
|
| 935 |
-
}
|
| 936 |
}
|
| 937 |
|
| 938 |
static bool whisper_kv_cache_find_slot(
|
|
@@ -1513,68 +1483,21 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 1513 |
}
|
| 1514 |
|
| 1515 |
wctx.backend = whisper_backend_init(wctx.params);
|
| 1516 |
-
|
| 1517 |
-
|
| 1518 |
-
|
| 1519 |
-
// to workaround this, we will allocate multiple buffers of smaller size and will split the tensors with the
|
| 1520 |
-
// model weights between them
|
| 1521 |
-
//
|
| 1522 |
-
// the map_t2b maps tensor names to buffer indices
|
| 1523 |
-
// as we iterate over the tensors, we will allocate new buffers when the current one is full
|
| 1524 |
-
//
|
| 1525 |
-
// finally, we create a separate allocator for each buffer and use it to allocate the tensors
|
| 1526 |
-
// we keep the allocators alive until all the tensors are loaded
|
| 1527 |
-
|
| 1528 |
-
GGML_ASSERT(model.buffers.empty());
|
| 1529 |
-
|
| 1530 |
-
std::map<std::string, int> map_t2b;
|
| 1531 |
-
|
| 1532 |
-
{
|
| 1533 |
-
size_t size_main = 0;
|
| 1534 |
-
size_t size_cur = 0;
|
| 1535 |
-
|
| 1536 |
-
static const size_t GB = 1024ull*1024ull*1024ull;
|
| 1537 |
-
|
| 1538 |
-
for (const auto & t : model.tensors) {
|
| 1539 |
-
const size_t cur = ggml_nbytes(t.second) + ggml_tensor_overhead();
|
| 1540 |
-
|
| 1541 |
-
// adding the tensor to the current buffer will exceed the limit, so we need to allocate a new buffer
|
| 1542 |
-
if (size_cur + cur > GB) {
|
| 1543 |
-
GGML_ASSERT(size_cur > 0 && "A tensor is too large to fit in a single buffer");
|
| 1544 |
-
|
| 1545 |
-
model.buffers.emplace_back(ggml_backend_alloc_buffer(wctx.backend, size_cur));
|
| 1546 |
-
|
| 1547 |
-
size_cur = cur;
|
| 1548 |
-
}
|
| 1549 |
-
|
| 1550 |
-
map_t2b[t.first] = model.buffers.size();
|
| 1551 |
-
|
| 1552 |
-
size_cur += cur;
|
| 1553 |
-
size_main += cur;
|
| 1554 |
-
}
|
| 1555 |
-
|
| 1556 |
-
// allocate the last buffer if needed
|
| 1557 |
-
if (size_cur > 0) {
|
| 1558 |
-
model.buffers.emplace_back(ggml_backend_alloc_buffer(wctx.backend, size_cur));
|
| 1559 |
-
}
|
| 1560 |
-
|
| 1561 |
-
GGML_ASSERT(model.buffers.size() > 0);
|
| 1562 |
-
|
| 1563 |
-
WHISPER_LOG_INFO("%s: %8s total size = %8.2f MB (%d buffers)\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6, (int) model.buffers.size());
|
| 1564 |
-
}
|
| 1565 |
-
|
| 1566 |
-
std::vector<ggml_allocr *> allocs(model.buffers.size());
|
| 1567 |
-
for (size_t i = 0; i < allocs.size(); ++i) {
|
| 1568 |
-
allocs[i] = ggml_allocr_new_from_buffer(model.buffers[i]);
|
| 1569 |
}
|
| 1570 |
|
| 1571 |
// allocate tensors in the backend buffers
|
| 1572 |
-
|
| 1573 |
-
|
| 1574 |
-
|
| 1575 |
-
|
| 1576 |
}
|
| 1577 |
|
|
|
|
|
|
|
|
|
|
| 1578 |
// load weights
|
| 1579 |
{
|
| 1580 |
size_t total_size = 0;
|
|
@@ -1636,15 +1559,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 1636 |
return false;
|
| 1637 |
}
|
| 1638 |
|
| 1639 |
-
ggml_backend_t backend = wctx.backend;
|
| 1640 |
|
| 1641 |
//printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str());
|
| 1642 |
|
| 1643 |
-
if ((
|
| 1644 |
-
#ifdef GGML_USE_METAL
|
| 1645 |
-
|| ggml_backend_is_metal(backend)
|
| 1646 |
-
#endif
|
| 1647 |
-
)) {
|
| 1648 |
// for the CPU and Metal backend, we can read directly into the tensor
|
| 1649 |
loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
|
| 1650 |
BYTESWAP_TENSOR(tensor);
|
|
@@ -1672,10 +1591,6 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 1672 |
}
|
| 1673 |
}
|
| 1674 |
|
| 1675 |
-
for (auto & alloc : allocs) {
|
| 1676 |
-
ggml_allocr_free(alloc);
|
| 1677 |
-
}
|
| 1678 |
-
|
| 1679 |
wctx.t_load_us = ggml_time_us() - t_start_us;
|
| 1680 |
|
| 1681 |
return true;
|
|
@@ -1704,7 +1619,6 @@ static struct ggml_cgraph * whisper_build_graph_conv(
|
|
| 1704 |
whisper_state & wstate,
|
| 1705 |
const int mel_offset) {
|
| 1706 |
const auto & model = wctx.model;
|
| 1707 |
-
const auto & mel_inp = wstate.mel;
|
| 1708 |
const auto & hparams = model.hparams;
|
| 1709 |
|
| 1710 |
const int n_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
|
|
@@ -1722,31 +1636,9 @@ static struct ggml_cgraph * whisper_build_graph_conv(
|
|
| 1722 |
|
| 1723 |
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
| 1724 |
|
| 1725 |
-
ggml_allocr * alloc = wstate.alloc_conv.alloc;
|
| 1726 |
-
|
| 1727 |
struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
|
| 1728 |
-
|
| 1729 |
-
|
| 1730 |
-
assert(mel->type == GGML_TYPE_F32);
|
| 1731 |
-
if (!ggml_allocr_is_measure(alloc)) {
|
| 1732 |
-
assert(mel_inp.n_mel == n_mels);
|
| 1733 |
-
|
| 1734 |
-
wstate.inp_mel.resize(ggml_nelements(mel));
|
| 1735 |
-
|
| 1736 |
-
float * dst = wstate.inp_mel.data();
|
| 1737 |
-
memset(dst, 0, ggml_nbytes(mel));
|
| 1738 |
-
|
| 1739 |
-
const int i0 = std::min(mel_offset, mel_inp.n_len);
|
| 1740 |
-
const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);
|
| 1741 |
-
|
| 1742 |
-
for (int j = 0; j < mel_inp.n_mel; ++j) {
|
| 1743 |
-
for (int i = i0; i < i1; ++i) {
|
| 1744 |
-
dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i];
|
| 1745 |
-
}
|
| 1746 |
-
}
|
| 1747 |
-
|
| 1748 |
-
ggml_backend_tensor_set(mel, wstate.inp_mel.data(), 0, ggml_nelements(mel)*sizeof(float));
|
| 1749 |
-
}
|
| 1750 |
|
| 1751 |
struct ggml_tensor * cur = nullptr;
|
| 1752 |
|
|
@@ -2138,11 +2030,39 @@ static bool whisper_encode_internal(
|
|
| 2138 |
{
|
| 2139 |
auto & alloc = wstate.alloc_conv.alloc;
|
| 2140 |
|
| 2141 |
-
ggml_allocr_reset(alloc);
|
| 2142 |
-
|
| 2143 |
ggml_cgraph * gf = whisper_build_graph_conv(wctx, wstate, mel_offset);
|
| 2144 |
|
| 2145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 2146 |
|
| 2147 |
if (!whisper_encode_external(wstate)) {
|
| 2148 |
if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
|
|
@@ -2155,11 +2075,12 @@ static bool whisper_encode_internal(
|
|
| 2155 |
if (!whisper_encode_external(wstate)) {
|
| 2156 |
auto & alloc = wstate.alloc_encode.alloc;
|
| 2157 |
|
| 2158 |
-
ggml_allocr_reset(alloc);
|
| 2159 |
-
|
| 2160 |
ggml_cgraph * gf = whisper_build_graph_encoder(wctx, wstate);
|
| 2161 |
|
| 2162 |
-
|
|
|
|
|
|
|
|
|
|
| 2163 |
|
| 2164 |
if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
|
| 2165 |
return false;
|
|
@@ -2170,11 +2091,12 @@ static bool whisper_encode_internal(
|
|
| 2170 |
{
|
| 2171 |
auto & alloc = wstate.alloc_cross.alloc;
|
| 2172 |
|
| 2173 |
-
ggml_allocr_reset(alloc);
|
| 2174 |
-
|
| 2175 |
ggml_cgraph * gf = whisper_build_graph_cross(wctx, wstate);
|
| 2176 |
|
| 2177 |
-
|
|
|
|
|
|
|
|
|
|
| 2178 |
|
| 2179 |
if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
|
| 2180 |
return false;
|
|
@@ -2190,7 +2112,8 @@ static bool whisper_encode_internal(
|
|
| 2190 |
static struct ggml_cgraph * whisper_build_graph_decoder(
|
| 2191 |
whisper_context & wctx,
|
| 2192 |
whisper_state & wstate,
|
| 2193 |
-
const whisper_batch & batch
|
|
|
|
| 2194 |
const auto & model = wctx.model;
|
| 2195 |
const auto & hparams = model.hparams;
|
| 2196 |
|
|
@@ -2198,8 +2121,6 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
|
|
| 2198 |
|
| 2199 |
WHISPER_ASSERT(!!kv_self.ctx);
|
| 2200 |
|
| 2201 |
-
ggml_allocr * alloc = wstate.alloc_decode.alloc;
|
| 2202 |
-
|
| 2203 |
const int n_ctx = kv_self.size;
|
| 2204 |
const int n_state = hparams.n_text_state;
|
| 2205 |
const int n_head = hparams.n_text_head;
|
|
@@ -2208,8 +2129,8 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
|
|
| 2208 |
const int n_tokens = batch.n_tokens;
|
| 2209 |
const int n_audio_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
|
| 2210 |
|
| 2211 |
-
const int32_t n_kv =
|
| 2212 |
-
const int32_t kv_head =
|
| 2213 |
|
| 2214 |
//WHISPER_LOG_DEBUG("%s: n_past = %d, n_tokens = %d, n_audio_ctx = %d, n_ctx = %d\n", __func__, n_past, n_tokens, n_audio_ctx, n_ctx);
|
| 2215 |
|
|
@@ -2224,48 +2145,18 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
|
|
| 2224 |
ggml_cgraph * gf = ggml_new_graph_custom(ctx0, WHISPER_MAX_NODES, false);
|
| 2225 |
|
| 2226 |
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
| 2227 |
-
|
| 2228 |
-
|
| 2229 |
-
if (!ggml_allocr_is_measure(alloc)) {
|
| 2230 |
-
ggml_backend_tensor_set(embd, batch.token, 0, n_tokens*ggml_element_size(embd));
|
| 2231 |
-
}
|
| 2232 |
|
| 2233 |
struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
| 2234 |
-
|
| 2235 |
-
|
| 2236 |
-
if (!ggml_allocr_is_measure(alloc)) {
|
| 2237 |
-
for (int i = 0; i < n_tokens; ++i) {
|
| 2238 |
-
const int32_t val = batch.pos[i];
|
| 2239 |
-
ggml_backend_tensor_set(position, &val, i*sizeof(int32_t), sizeof(int32_t));
|
| 2240 |
-
}
|
| 2241 |
-
}
|
| 2242 |
|
| 2243 |
const float KQscale = pow(float(n_state)/n_head, -0.25);
|
| 2244 |
|
| 2245 |
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
| 2246 |
-
|
| 2247 |
-
|
| 2248 |
-
if (!ggml_allocr_is_measure(alloc)) {
|
| 2249 |
-
wstate.inp_mask.resize(n_kv*n_tokens);
|
| 2250 |
-
|
| 2251 |
-
float * data = wstate.inp_mask.data();
|
| 2252 |
-
memset(data, 0, ggml_nbytes(KQ_mask));
|
| 2253 |
-
|
| 2254 |
-
for (int h = 0; h < 1; ++h) {
|
| 2255 |
-
for (int j = 0; j < n_tokens; ++j) {
|
| 2256 |
-
const whisper_pos pos = batch.pos[j];
|
| 2257 |
-
const whisper_seq_id seq_id = batch.seq_id[j][0];
|
| 2258 |
-
|
| 2259 |
-
for (int i = 0; i < n_kv; ++i) {
|
| 2260 |
-
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
| 2261 |
-
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
| 2262 |
-
}
|
| 2263 |
-
}
|
| 2264 |
-
}
|
| 2265 |
-
}
|
| 2266 |
-
|
| 2267 |
-
ggml_backend_tensor_set(KQ_mask, wstate.inp_mask.data(), 0, ggml_nelements(KQ_mask)*sizeof(float));
|
| 2268 |
-
}
|
| 2269 |
|
| 2270 |
// token encoding + position encoding
|
| 2271 |
struct ggml_tensor * cur =
|
|
@@ -2592,11 +2483,53 @@ static bool whisper_decode_internal(
|
|
| 2592 |
{
|
| 2593 |
auto & alloc = wstate.alloc_decode.alloc;
|
| 2594 |
|
| 2595 |
-
|
| 2596 |
|
| 2597 |
-
|
|
|
|
|
|
|
|
|
|
| 2598 |
|
| 2599 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2600 |
|
| 2601 |
logits = gf->nodes[gf->n_nodes - 1];
|
| 2602 |
|
|
@@ -3046,6 +2979,11 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
|
|
| 3046 |
whisper_state * state = new whisper_state;
|
| 3047 |
|
| 3048 |
state->backend = whisper_backend_init(ctx->params);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3049 |
|
| 3050 |
// at this point, we don't know yet how many decoders will be used, so we overallocate 3x ctx
|
| 3051 |
// in theory, there can be a case where this is not enough, but in practice it should always be enough
|
|
@@ -3053,7 +2991,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
|
|
| 3053 |
|
| 3054 |
if (!kv_cache_init(ctx->model.hparams, state->kv_self, ctx->backend, ctx->itype, factor*ctx->model.hparams.n_text_ctx)) {
|
| 3055 |
WHISPER_LOG_ERROR("%s: kv_cache_init() failed for self-attention cache\n", __func__);
|
| 3056 |
-
|
| 3057 |
return nullptr;
|
| 3058 |
}
|
| 3059 |
|
|
@@ -3064,7 +3002,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
|
|
| 3064 |
|
| 3065 |
if (!kv_cache_init(ctx->model.hparams, state->kv_cross, ctx->backend, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
|
| 3066 |
WHISPER_LOG_ERROR("%s: kv_cache_init() failed for cross-attention cache\n", __func__);
|
| 3067 |
-
|
| 3068 |
return nullptr;
|
| 3069 |
}
|
| 3070 |
|
|
@@ -3083,7 +3021,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
|
|
| 3083 |
if (!state->ctx_coreml) {
|
| 3084 |
WHISPER_LOG_ERROR("%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
|
| 3085 |
#ifndef WHISPER_COREML_ALLOW_FALLBACK
|
| 3086 |
-
|
| 3087 |
return nullptr;
|
| 3088 |
#endif
|
| 3089 |
} else {
|
|
@@ -3107,37 +3045,55 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
|
|
| 3107 |
|
| 3108 |
// conv allocator
|
| 3109 |
{
|
| 3110 |
-
whisper_allocr_graph_init(state->alloc_conv, ctx->backend,
|
| 3111 |
[&]() {
|
| 3112 |
return whisper_build_graph_conv(*ctx, *state, 0);
|
| 3113 |
});
|
| 3114 |
|
|
|
|
|
|
|
| 3115 |
WHISPER_LOG_INFO("%s: compute buffer (conv) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_conv) / 1e6);
|
| 3116 |
}
|
| 3117 |
|
| 3118 |
// encoder allocator
|
| 3119 |
if (!whisper_encode_external(*state)) {
|
| 3120 |
-
whisper_allocr_graph_init(state->alloc_encode, ctx->backend,
|
| 3121 |
[&]() {
|
| 3122 |
return whisper_build_graph_encoder(*ctx, *state);
|
| 3123 |
});
|
| 3124 |
|
|
|
|
|
|
|
|
| 3125 |
WHISPER_LOG_INFO("%s: compute buffer (encode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_encode) / 1e6);
|
| 3126 |
}
|
| 3127 |
|
| 3128 |
// cross allocator
|
| 3129 |
{
|
| 3130 |
-
whisper_allocr_graph_init(state->alloc_cross, ctx->backend,
|
| 3131 |
[&]() {
|
| 3132 |
return whisper_build_graph_cross(*ctx, *state);
|
| 3133 |
});
|
| 3134 |
|
|
|
|
|
|
|
|
| 3135 |
WHISPER_LOG_INFO("%s: compute buffer (cross) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_cross) / 1e6);
|
| 3136 |
}
|
| 3137 |
|
| 3138 |
// decoder allocator
|
| 3139 |
{
|
| 3140 |
-
whisper_allocr_graph_init(state->alloc_decode, ctx->backend,
|
| 3141 |
[&]() {
|
| 3142 |
const auto & hparams = ctx->model.hparams;
|
| 3143 |
|
|
@@ -3147,17 +3103,18 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
|
|
| 3147 |
|
| 3148 |
whisper_batch_prep_legacy(state->batch, nullptr, n_tokens, n_past, 0);
|
| 3149 |
|
| 3150 |
-
return whisper_build_graph_decoder(*ctx, *state, state->batch);
|
| 3151 |
});
|
| 3152 |
|
|
|
|
|
|
| 3153 |
WHISPER_LOG_INFO("%s: compute buffer (decode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_decode) / 1e6);
|
| 3154 |
}
|
| 3155 |
|
| 3156 |
-
whisper_allocr_graph_realloc(state->alloc_conv, ctx->backend);
|
| 3157 |
-
whisper_allocr_graph_realloc(state->alloc_encode, ctx->backend);
|
| 3158 |
-
whisper_allocr_graph_realloc(state->alloc_cross, ctx->backend);
|
| 3159 |
-
whisper_allocr_graph_realloc(state->alloc_decode, ctx->backend);
|
| 3160 |
-
|
| 3161 |
return state;
|
| 3162 |
}
|
| 3163 |
|
|
@@ -3380,8 +3337,7 @@ struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loa
|
|
| 3380 |
return whisper_init_with_params_no_state(loader, whisper_context_default_params());
|
| 3381 |
}
|
| 3382 |
|
| 3383 |
-
void whisper_free_state(struct whisper_state * state)
|
| 3384 |
-
{
|
| 3385 |
if (state) {
|
| 3386 |
kv_cache_free(state->kv_self);
|
| 3387 |
kv_cache_free(state->kv_cross);
|
|
@@ -3402,10 +3358,10 @@ void whisper_free_state(struct whisper_state * state)
|
|
| 3402 |
|
| 3403 |
whisper_batch_free(state->batch);
|
| 3404 |
|
| 3405 |
-
|
| 3406 |
-
|
| 3407 |
-
|
| 3408 |
-
|
| 3409 |
|
| 3410 |
ggml_backend_free(state->backend);
|
| 3411 |
|
|
@@ -3415,15 +3371,9 @@ void whisper_free_state(struct whisper_state * state)
|
|
| 3415 |
|
| 3416 |
void whisper_free(struct whisper_context * ctx) {
|
| 3417 |
if (ctx) {
|
| 3418 |
-
|
| 3419 |
-
ggml_free(ctx->model.ctx);
|
| 3420 |
-
}
|
| 3421 |
|
| 3422 |
-
|
| 3423 |
-
if (buffer) {
|
| 3424 |
-
ggml_backend_buffer_free(buffer);
|
| 3425 |
-
}
|
| 3426 |
-
}
|
| 3427 |
|
| 3428 |
whisper_free_state(ctx->state);
|
| 3429 |
|
|
|
|
| 471 |
|
| 472 |
// ggml_allocr wrapper for whisper usage
|
| 473 |
struct whisper_allocr {
|
| 474 |
+
ggml_gallocr_t alloc = nullptr;
|
| 475 |
|
| 476 |
std::vector<uint8_t> meta;
|
|
|
|
|
|
|
| 477 |
};
|
| 478 |
|
| 479 |
static size_t whisper_allocr_size(struct whisper_allocr & allocr) {
|
| 480 |
+
return allocr.meta.size() + ggml_gallocr_get_buffer_size(allocr.alloc, 0);
|
| 481 |
}
|
| 482 |
|
| 483 |
// measure the memory usage of a graph and prepare the allocr's internal data buffer
|
| 484 |
+
static bool whisper_allocr_graph_init(struct whisper_allocr & allocr, ggml_backend_t backend, std::function<struct ggml_cgraph *()> && get_graph) {
|
| 485 |
auto & alloc = allocr.alloc;
|
| 486 |
auto & meta = allocr.meta;
|
| 487 |
|
| 488 |
+
alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
|
| 489 |
|
| 490 |
meta.resize(ggml_tensor_overhead()*WHISPER_MAX_NODES + ggml_graph_overhead());
|
| 491 |
|
| 492 |
+
// since there are dependencies between the different graphs,
|
| 493 |
+
// we need to allocate them instead of only reserving to get the correct compute buffer size
|
| 494 |
+
if (!ggml_gallocr_alloc_graph(alloc, get_graph())) {
|
| 495 |
+
// failed to allocate the compute buffer
|
| 496 |
+
WHISPER_LOG_ERROR("%s: failed to allocate the compute buffer\n", __func__);
|
| 497 |
+
return false;
|
|
|
|
|
|
|
|
|
| 498 |
}
|
| 499 |
+
return true;
|
| 500 |
}
|
| 501 |
|
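The comment above is the key detail: whisper's conv/encode/cross/decode graphs feed each other through tensors kept in the state, so the wrapper allocates the measurement graph for real instead of only reserving, which keeps those cross-graph tensors addressable. For a single self-contained graph the lighter reserve-then-alloc flow is enough; a sketch of that simpler case (assuming the single-buffer-type ggml_gallocr_reserve entry point from ggml-alloc v3):

static bool prepare_and_run(ggml_backend_t backend, struct ggml_cgraph * measure_graph, struct ggml_cgraph * graph) {
    ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));

    if (!ggml_gallocr_reserve(alloc, measure_graph)) {   // size the compute buffer, no data placed yet
        ggml_gallocr_free(alloc);
        return false;
    }
    if (!ggml_gallocr_alloc_graph(alloc, graph)) {       // place the real graph into that buffer
        ggml_gallocr_free(alloc);
        return false;
    }

    bool ok = ggml_backend_graph_compute(backend, graph);
    ggml_gallocr_free(alloc);
    return ok;
}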
| 502 |
// medium
|
|
|
|
| 638 |
struct ggml_tensor * k;
|
| 639 |
struct ggml_tensor * v;
|
| 640 |
|
| 641 |
+
struct ggml_context * ctx = nullptr;
|
| 642 |
|
| 643 |
+
ggml_backend_buffer_t buffer = nullptr;
|
| 644 |
};
|
| 645 |
|
| 646 |
struct whisper_model {
|
|
|
|
| 678 |
std::vector<whisper_layer_decoder> layers_decoder;
|
| 679 |
|
| 680 |
// ggml context that contains all the meta information about the model tensors
|
| 681 |
+
struct ggml_context * ctx = nullptr;
|
| 682 |
|
| 683 |
// the model backend data is read-only and can be shared between processors
|
| 684 |
+
ggml_backend_buffer_t buffer = nullptr;
|
| 685 |
|
| 686 |
// tensors
|
| 687 |
int n_loaded;
|
|
|
|
| 883 |
cache.ctx = ggml_init(params);
|
| 884 |
|
| 885 |
if (!cache.ctx) {
|
| 886 |
+
WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache context\n", __func__);
|
| 887 |
return false;
|
| 888 |
}
|
| 889 |
|
| 890 |
cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
|
| 891 |
cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
|
| 892 |
|
| 893 |
+
cache.buffer = ggml_backend_alloc_ctx_tensors(cache.ctx, backend);
|
| 894 |
+
if (!cache.buffer) {
|
| 895 |
+
WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache\n", __func__);
|
| 896 |
+
return false;
|
|
|
|
|
|
|
|
| 897 |
}
|
| 898 |
|
| 899 |
return true;
|
| 900 |
}
|
| 901 |
|
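kv_cache_init now relies on ggml_backend_alloc_ctx_tensors: every tensor created in a no_alloc context is placed into a single backend buffer in one call, replacing the old measure/allocate dance with ggml_allocr. The same pattern in isolation (a hypothetical pair of cache tensors, error handling trimmed to the essentials):

static ggml_backend_buffer_t alloc_two_tensors(ggml_backend_t backend, struct ggml_context ** out_ctx, int64_t n_elements) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 2*ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // the context only holds tensor metadata
    };
    struct ggml_context * ctx = ggml_init(params);
    if (!ctx) {
        return NULL;
    }

    ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); // k
    ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); // v

    // one buffer on the target backend receives every tensor created in ctx
    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
    if (!buffer) {
        ggml_free(ctx);
        return NULL;
    }

    *out_ctx = ctx;
    return buffer;
}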
| 902 |
static void kv_cache_free(struct whisper_kv_cache & cache) {
|
| 903 |
+
ggml_free(cache.ctx);
|
| 904 |
+
ggml_backend_buffer_free(cache.buffer);
|
| 905 |
+
cache.ctx = nullptr;
|
|
|
|
|
|
|
| 906 |
}
|
| 907 |
|
| 908 |
static bool whisper_kv_cache_find_slot(
|
|
|
|
| 1483 |
}
|
| 1484 |
|
| 1485 |
wctx.backend = whisper_backend_init(wctx.params);
|
| 1486 |
+
if (!wctx.backend) {
|
| 1487 |
+
WHISPER_LOG_ERROR("%s: failed to initialize the backend\n", __func__);
|
| 1488 |
+
return false;
|
|
|
|
|
|
|
|
|
|
| 1489 |
}
|
| 1490 |
|
| 1491 |
// allocate tensors in the backend buffers
|
| 1492 |
+
model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, wctx.backend);
|
| 1493 |
+
if (!model.buffer) {
|
| 1494 |
+
WHISPER_LOG_ERROR("%s: failed to allocate memory for the model\n", __func__);
|
| 1495 |
+
return false;
|
| 1496 |
}
|
| 1497 |
|
| 1498 |
+
size_t size_main = ggml_backend_buffer_get_size(model.buffer);
|
| 1499 |
+
WHISPER_LOG_INFO("%s: %8s total size = %8.2f MB\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6);
|
| 1500 |
+
|
| 1501 |
// load weights
|
| 1502 |
{
|
| 1503 |
size_t total_size = 0;
|
|
|
|
| 1559 |
return false;
|
| 1560 |
}
|
| 1561 |
|
| 1562 |
+
//ggml_backend_t backend = wctx.backend;
|
| 1563 |
|
| 1564 |
//printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str());
|
| 1565 |
|
| 1566 |
+
if (ggml_backend_buffer_is_host(model.buffer)) {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1567 |
// for the CPU and Metal backend, we can read directly into the tensor
|
| 1568 |
loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
|
| 1569 |
BYTESWAP_TENSOR(tensor);
|
|
|
|
| 1591 |
}
|
| 1592 |
}
|
| 1593 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1594 |
wctx.t_load_us = ggml_time_us() - t_start_us;
|
| 1595 |
|
| 1596 |
return true;
|
|
|
|
| 1619 |
whisper_state & wstate,
|
| 1620 |
const int mel_offset) {
|
| 1621 |
const auto & model = wctx.model;
|
|
|
|
| 1622 |
const auto & hparams = model.hparams;
|
| 1623 |
|
| 1624 |
const int n_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
|
|
|
|
| 1636 |
|
| 1637 |
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
| 1638 |
|
|
|
|
|
|
|
| 1639 |
struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
|
| 1640 |
+
ggml_set_name(mel, "mel");
|
| 1641 |
+
ggml_set_input(mel);
|
|
|
|
|
|
|
|
|
| 1642 |
|
| 1643 |
struct ggml_tensor * cur = nullptr;
|
| 1644 |
|
|
|
|
| 2030 |
{
|
| 2031 |
auto & alloc = wstate.alloc_conv.alloc;
|
| 2032 |
|
|
|
|
|
|
|
| 2033 |
ggml_cgraph * gf = whisper_build_graph_conv(wctx, wstate, mel_offset);
|
| 2034 |
|
| 2035 |
+
if (!ggml_gallocr_alloc_graph(alloc, gf)) {
|
| 2036 |
+
// should never happen as we pre-allocate the memory
|
| 2037 |
+
return false;
|
| 2038 |
+
}
|
| 2039 |
+
|
| 2040 |
+
// set the input
|
| 2041 |
+
{
|
| 2042 |
+
const auto & mel_inp = wstate.mel;
|
| 2043 |
+
const int n_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : wctx.model.hparams.n_audio_ctx;
|
| 2044 |
+
|
| 2045 |
+
struct ggml_tensor * mel = ggml_graph_get_tensor(gf, "mel");
|
| 2046 |
+
|
| 2047 |
+
assert(mel->type == GGML_TYPE_F32);
|
| 2048 |
+
assert(mel_inp.n_mel == wctx.model.hparams.n_mels);
|
| 2049 |
+
|
| 2050 |
+
wstate.inp_mel.resize(ggml_nelements(mel));
|
| 2051 |
+
|
| 2052 |
+
float * dst = wstate.inp_mel.data();
|
| 2053 |
+
memset(dst, 0, ggml_nbytes(mel));
|
| 2054 |
+
|
| 2055 |
+
const int i0 = std::min(mel_offset, mel_inp.n_len);
|
| 2056 |
+
const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);
|
| 2057 |
+
|
| 2058 |
+
for (int j = 0; j < mel_inp.n_mel; ++j) {
|
| 2059 |
+
for (int i = i0; i < i1; ++i) {
|
| 2060 |
+
dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i];
|
| 2061 |
+
}
|
| 2062 |
+
}
|
| 2063 |
+
|
| 2064 |
+
ggml_backend_tensor_set(mel, wstate.inp_mel.data(), 0, ggml_nelements(mel)*sizeof(float));
|
| 2065 |
+
}
|
| 2066 |
|
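This is the pattern the whole commit moves whisper.cpp to: inputs are named and marked with ggml_set_input() when the graph is built, and only after ggml_gallocr_alloc_graph() has given them storage are they located by name and filled. The two-step upload in isolation (tensor name and sizes are the ones used above):

static bool upload_mel(struct ggml_cgraph * gf, const float * data, int64_t n) {
    struct ggml_tensor * mel = ggml_graph_get_tensor(gf, "mel");
    if (mel == NULL || ggml_nelements(mel) != n) {
        return false; // the graph was not built with the expected input
    }
    ggml_backend_tensor_set(mel, data, 0, n*sizeof(float));
    return true;
}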
| 2067 |
if (!whisper_encode_external(wstate)) {
|
| 2068 |
if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
|
|
|
|
| 2075 |
if (!whisper_encode_external(wstate)) {
|
| 2076 |
auto & alloc = wstate.alloc_encode.alloc;
|
| 2077 |
|
|
|
|
|
|
|
| 2078 |
ggml_cgraph * gf = whisper_build_graph_encoder(wctx, wstate);
|
| 2079 |
|
| 2080 |
+
if (!ggml_gallocr_alloc_graph(alloc, gf)) {
|
| 2081 |
+
// should never happen as we pre-allocate the memory
|
| 2082 |
+
return false;
|
| 2083 |
+
}
|
| 2084 |
|
| 2085 |
if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
|
| 2086 |
return false;
|
|
|
|
| 2091 |
{
|
| 2092 |
auto & alloc = wstate.alloc_cross.alloc;
|
| 2093 |
|
|
|
|
|
|
|
| 2094 |
ggml_cgraph * gf = whisper_build_graph_cross(wctx, wstate);
|
| 2095 |
|
| 2096 |
+
if (!ggml_gallocr_alloc_graph(alloc, gf)) {
|
| 2097 |
+
// should never happen as we pre-allocate the memory
|
| 2098 |
+
return false;
|
| 2099 |
+
}
|
| 2100 |
|
| 2101 |
if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
|
| 2102 |
return false;
|
|
|
|
static struct ggml_cgraph * whisper_build_graph_decoder(
        whisper_context & wctx,
        whisper_state & wstate,
+       const whisper_batch & batch,
+       bool worst_case) {
    const auto & model   = wctx.model;
    const auto & hparams = model.hparams;

...

    WHISPER_ASSERT(!!kv_self.ctx);

...
    const int n_ctx   = kv_self.size;
    const int n_state = hparams.n_text_state;
    const int n_head  = hparams.n_text_head;
...
    const int n_tokens = batch.n_tokens;
    const int n_audio_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;

+   const int32_t n_kv    = worst_case ? n_ctx            : kv_self.n;
+   const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head;

    //WHISPER_LOG_DEBUG("%s: n_past = %d, n_tokens = %d, n_audio_ctx = %d, n_ctx = %d\n", __func__, n_past, n_tokens, n_audio_ctx, n_ctx);

...
    ggml_cgraph * gf = ggml_new_graph_custom(ctx0, WHISPER_MAX_NODES, false);

    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+   ggml_set_name(embd, "embd");
+   ggml_set_input(embd);

    struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+   ggml_set_name(position, "position");
+   ggml_set_input(position);

    const float KQscale = pow(float(n_state)/n_head, -0.25);

    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+   ggml_set_name(KQ_mask, "KQ_mask");
+   ggml_set_input(KQ_mask);

    // token encoding + position encoding
    struct ggml_tensor * cur =
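whisper_build_graph_decoder() now receives the batch and a worst_case flag. During allocator setup the graph is built once with the largest shapes it can ever take, so the reserved compute buffer covers every later call; at decode time it is built again with the real KV-cache state. The inputs are only created, named, and tagged here; their contents are uploaded in whisper_decode_internal() further down. The two lines below (repeated from the hunk above, with added commentary) carry that distinction:

// The same builder serves both measurement and decoding:
//   worst_case == true  -> shapes sized for the full KV cache (used once, to reserve the compute buffer)
//   worst_case == false -> shapes sized for the current batch  (used on every decode call)
const int32_t n_kv    = worst_case ? n_ctx            : kv_self.n;
const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head;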
    {
        auto & alloc = wstate.alloc_decode.alloc;

+       ggml_cgraph * gf = whisper_build_graph_decoder(wctx, wstate, batch, false);

+       if (!ggml_gallocr_alloc_graph(alloc, gf)) {
+           // should never happen as we pre-allocate the memory
+           return false;
+       }

+       // set the inputs
+       {
+           struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");
+           ggml_backend_tensor_set(embd, batch.token, 0, n_tokens*ggml_element_size(embd));
+       }
+
+       {
+           struct ggml_tensor * position = ggml_graph_get_tensor(gf, "position");
+           for (int i = 0; i < n_tokens; ++i) {
+               const int32_t val = batch.pos[i];
+               ggml_backend_tensor_set(position, &val, i*sizeof(int32_t), sizeof(int32_t));
+           }
+       }
+
+       {
+           struct ggml_tensor * KQ_mask = ggml_graph_get_tensor(gf, "KQ_mask");
+
+           auto & kv_self = wstate.kv_self;
+           const int32_t n_kv = kv_self.n;
+
+           wstate.inp_mask.resize(n_kv*n_tokens);
+
+           float * data = wstate.inp_mask.data();
+           memset(data, 0, ggml_nbytes(KQ_mask));
+
+           for (int h = 0; h < 1; ++h) {
+               for (int j = 0; j < n_tokens; ++j) {
+                   const whisper_pos    pos    = batch.pos[j];
+                   const whisper_seq_id seq_id = batch.seq_id[j][0];
+
+                   for (int i = 0; i < n_kv; ++i) {
+                       if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                           data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                       }
+                   }
+               }
+           }
+
+           ggml_backend_tensor_set(KQ_mask, wstate.inp_mask.data(), 0, ggml_nelements(KQ_mask)*sizeof(float));
+       }

        logits = gf->nodes[gf->n_nodes - 1];

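The KQ_mask holds one float per (token, KV-cell) pair: 0.0f where the token may attend to the cell, and -INFINITY where the cell belongs to another sequence or to a future position, so the softmax assigns it zero weight. A self-contained toy version of the same fill over plain arrays (illustration only, not whisper code):

#include <cmath>
#include <vector>

// mask[j*n_kv + i] == 0.0f      -> token j may attend to cache cell i
// mask[j*n_kv + i] == -INFINITY -> cell i is hidden from token j (wrong sequence or future position)
std::vector<float> build_mask(int n_kv, int n_tokens,
                              const std::vector<int> & cell_pos, const std::vector<int> & cell_seq,
                              const std::vector<int> & tok_pos,  const std::vector<int> & tok_seq) {
    std::vector<float> mask(n_kv*n_tokens, 0.0f);
    for (int j = 0; j < n_tokens; ++j) {
        for (int i = 0; i < n_kv; ++i) {
            if (cell_seq[i] != tok_seq[j] || cell_pos[i] > tok_pos[j]) {
                mask[j*n_kv + i] = -INFINITY;
            }
        }
    }
    return mask;
}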
    whisper_state * state = new whisper_state;

    state->backend = whisper_backend_init(ctx->params);
+   if (!state->backend) {
+       WHISPER_LOG_ERROR("%s: whisper_backend_init() failed\n", __func__);
+       whisper_free_state(state);
+       return nullptr;
+   }

    // at this point, we don't know yet how many decoders will be used, so we overallocate 3x ctx
    // in theory, there can be a case where this is not enough, but in practice it should always be enough
...

    if (!kv_cache_init(ctx->model.hparams, state->kv_self, ctx->backend, ctx->itype, factor*ctx->model.hparams.n_text_ctx)) {
        WHISPER_LOG_ERROR("%s: kv_cache_init() failed for self-attention cache\n", __func__);
+       whisper_free_state(state);
        return nullptr;
    }

    if (!kv_cache_init(ctx->model.hparams, state->kv_cross, ctx->backend, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
        WHISPER_LOG_ERROR("%s: kv_cache_init() failed for cross-attention cache\n", __func__);
+       whisper_free_state(state);
        return nullptr;
    }

    if (!state->ctx_coreml) {
        WHISPER_LOG_ERROR("%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
#ifndef WHISPER_COREML_ALLOW_FALLBACK
+       whisper_free_state(state);
        return nullptr;
#endif
    } else {
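Every failure path in whisper_init_state() now logs the error, releases the partially-built state with whisper_free_state(), and returns nullptr, which is what the "avoid leaks when initialization fails" part of this commit refers to. A hypothetical condensation of that discipline (the patch deliberately spells it out at each call site rather than using a macro):

// Hypothetical helper, not in the patch: bail out of whisper_init_state() on failure,
// freeing whatever has been initialized so far. This relies on whisper_free_state()
// tolerating a partially-initialized state.
#define WHISPER_INIT_CHECK(cond, msg)                        \
    do {                                                     \
        if (!(cond)) {                                       \
            WHISPER_LOG_ERROR("%s: %s\n", __func__, (msg));  \
            whisper_free_state(state);                       \
            return nullptr;                                  \
        }                                                    \
    } while (0)

// e.g. WHISPER_INIT_CHECK(state->backend, "whisper_backend_init() failed");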
    // conv allocator
    {
+       bool ok = whisper_allocr_graph_init(state->alloc_conv, ctx->backend,
                [&]() {
                    return whisper_build_graph_conv(*ctx, *state, 0);
                });

+       if (!ok) {
+           WHISPER_LOG_ERROR("%s: failed to init conv allocator\n", __func__);
+           whisper_free_state(state);
+           return nullptr;
+       }
+
        WHISPER_LOG_INFO("%s: compute buffer (conv) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_conv) / 1e6);
    }

    // encoder allocator
    if (!whisper_encode_external(*state)) {
+       bool ok = whisper_allocr_graph_init(state->alloc_encode, ctx->backend,
                [&]() {
                    return whisper_build_graph_encoder(*ctx, *state);
                });

+       if (!ok) {
+           WHISPER_LOG_ERROR("%s: failed to init encoder allocator\n", __func__);
+           whisper_free_state(state);
+           return nullptr;
+       }
+
        WHISPER_LOG_INFO("%s: compute buffer (encode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_encode) / 1e6);
    }

    // cross allocator
    {
+       bool ok = whisper_allocr_graph_init(state->alloc_cross, ctx->backend,
                [&]() {
                    return whisper_build_graph_cross(*ctx, *state);
                });

+       if (!ok) {
+           WHISPER_LOG_ERROR("%s: failed to init cross allocator\n", __func__);
+           whisper_free_state(state);
+           return nullptr;
+       }
+
        WHISPER_LOG_INFO("%s: compute buffer (cross) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_cross) / 1e6);
    }

    // decoder allocator
    {
+       bool ok = whisper_allocr_graph_init(state->alloc_decode, ctx->backend,
                [&]() {
                    const auto & hparams = ctx->model.hparams;
...

                    whisper_batch_prep_legacy(state->batch, nullptr, n_tokens, n_past, 0);

+                   return whisper_build_graph_decoder(*ctx, *state, state->batch, true);
                });

+       if (!ok) {
+           WHISPER_LOG_ERROR("%s: failed to init decoder allocator\n", __func__);
+           whisper_free_state(state);
+           return nullptr;
+       }
+
        WHISPER_LOG_INFO("%s: compute buffer (decode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_decode) / 1e6);
    }

...
    return state;
}

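whisper_allocr_graph_init() (defined earlier in this file, outside the hunks shown here) now returns a bool so allocation failures can be reported instead of silently ignored. Under the ggml-alloc v3 API it presumably amounts to creating a ggml_gallocr for the backend's default buffer type and sizing its compute buffer from the worst-case graph; whisper_allocr_size() would then report that buffer size, which is where the MB figures logged above come from. A sketch under those assumptions, not the literal helper (needs <functional>):

// Sketch, assuming `whisper_allocr` exposes its ggml_gallocr_t as `alloc`.
// The worst-case graph returned by get_graph() determines the size of the
// compute buffer that later ggml_gallocr_alloc_graph() calls will reuse.
static bool whisper_allocr_graph_init_sketch(
        struct whisper_allocr & allocr,
        ggml_backend_t backend,
        std::function<struct ggml_cgraph *()> && get_graph) {
    allocr.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
    if (allocr.alloc == nullptr) {
        return false;
    }
    // reserve enough memory for the largest graph this allocator will ever see
    return ggml_gallocr_reserve(allocr.alloc, get_graph());
}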
    return whisper_init_with_params_no_state(loader, whisper_context_default_params());
}

+void whisper_free_state(struct whisper_state * state) {
    if (state) {
        kv_cache_free(state->kv_self);
        kv_cache_free(state->kv_cross);
...

        whisper_batch_free(state->batch);

+       ggml_gallocr_free(state->alloc_conv.alloc);
+       ggml_gallocr_free(state->alloc_encode.alloc);
+       ggml_gallocr_free(state->alloc_cross.alloc);
+       ggml_gallocr_free(state->alloc_decode.alloc);

        ggml_backend_free(state->backend);

...

void whisper_free(struct whisper_context * ctx) {
    if (ctx) {
+       ggml_free(ctx->model.ctx);

+       ggml_backend_buffer_free(ctx->model.buffer);

        whisper_free_state(ctx->state);
