slaren committed
Commit 5cffd6f · unverified · 1 Parent(s): 469988b

ggml-alloc : v3 (ggml/727)


* ggml-alloc v3

ggml-ci

* fix ci

ggml-ci

* whisper : check for backend buffer allocation failures

* whisper : avoid leaks when initialization fails

* cleanup

ggml-ci

* style fixes

ggml-ci

Files changed (7)
  1. ggml-alloc.c +563 -490
  2. ggml-alloc.h +39 -65
  3. ggml-backend.c +225 -258
  4. ggml-backend.h +5 -10
  5. ggml.c +19 -9
  6. ggml.h +15 -3
  7. whisper.cpp +175 -225
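
The changes below replace the legacy ggml_allocr wrapper and the measure allocators with a new graph allocator, ggml_gallocr, which is created from one or more backend buffer types, reserves its backend buffers from a worst-case graph, and then assigns tensor offsets on every allocation. As a rough, non-authoritative sketch of how a caller might use the new API (build_graph() and the CPU backend choice are placeholders, not part of this commit):

#include <stdio.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// hedged usage sketch for the v3 graph allocator introduced in this commit
static void example_graph_alloc(struct ggml_context * ctx) {
    ggml_backend_t backend = ggml_backend_cpu_init();

    // one dynamic allocator per buffer type; here a single CPU buffer type
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));

    // reserve the compute buffer from a worst-case graph, then assign tensor
    // addresses before evaluation; with a single buffer the allocator
    // reallocates automatically if a later graph needs more memory
    struct ggml_cgraph * gf = build_graph(ctx); // hypothetical graph builder
    if (!ggml_gallocr_reserve(galloc, gf) || !ggml_gallocr_alloc_graph(galloc, gf)) {
        fprintf(stderr, "failed to allocate the compute buffer\n");
    } else {
        fprintf(stderr, "compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
    }

    ggml_gallocr_free(galloc);
    ggml_backend_free(backend);
}
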
ggml-alloc.c CHANGED
@@ -17,6 +17,50 @@
17
  //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
18
  #define AT_PRINTF(...)
19
20
  // TODO: GGML_PAD ?
21
  static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
22
  assert(alignment && !(alignment & (alignment - 1))); // power of 2
@@ -24,66 +68,102 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen
24
  return offset + align;
25
  }
26
27
  struct free_block {
28
- void * addr;
29
  size_t size;
30
  };
31
 
32
- struct ggml_tallocr {
33
- struct ggml_backend_buffer * buffer;
34
- bool buffer_owned;
35
- void * base;
36
  size_t alignment;
37
-
38
  int n_free_blocks;
39
  struct free_block free_blocks[MAX_FREE_BLOCKS];
40
-
41
  size_t max_size;
42
 
43
- bool measure;
44
-
45
  #ifdef GGML_ALLOCATOR_DEBUG
46
- struct ggml_tensor * allocated_tensors[1024];
 
 
 
47
  #endif
48
  };
49
 
50
  #ifdef GGML_ALLOCATOR_DEBUG
51
- static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
52
  for (int i = 0; i < 1024; i++) {
53
- if (alloc->allocated_tensors[i] == NULL) {
54
- alloc->allocated_tensors[i] = tensor;
 
55
  return;
56
  }
57
  }
58
  GGML_ASSERT(!"out of allocated_tensors");
59
  }
60
- static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
61
  for (int i = 0; i < 1024; i++) {
62
- if (alloc->allocated_tensors[i] == tensor ||
63
- (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
64
- alloc->allocated_tensors[i] = NULL;
65
  return;
66
  }
67
  }
68
- printf("tried to free tensor %s not found\n", tensor->name);
69
  GGML_ASSERT(!"tensor not found");
70
  }
71
  #endif
72
 
73
- // check if a tensor is allocated by this buffer
74
- static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
75
- return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer);
76
- }
77
-
78
- static bool ggml_is_view(struct ggml_tensor * t) {
79
- return t->view_src != NULL;
80
- }
81
-
82
- void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
83
- GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
84
- GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
85
-
86
- size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
87
  size = aligned_offset(NULL, size, alloc->alignment);
88
 
89
  AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -109,16 +189,17 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
109
  if (block->size >= size) {
110
  best_fit_block = alloc->n_free_blocks - 1;
111
  } else {
112
- fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
113
- __func__, tensor->name, size, max_avail);
 
114
  GGML_ASSERT(!"not enough space in the buffer");
115
- return;
116
  }
117
  }
118
 
119
  struct free_block * block = &alloc->free_blocks[best_fit_block];
120
- void * addr = block->addr;
121
- block->addr = (char*)block->addr + size;
122
  block->size -= size;
123
  if (block->size == 0) {
124
  // remove block if empty
@@ -128,59 +209,63 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
128
  }
129
  }
130
 
131
- AT_PRINTF("block %d, addr %p\n", best_fit_block, addr);
132
-
133
- tensor->data = addr;
134
- tensor->buffer = alloc->buffer;
135
- if (!alloc->measure) {
136
- ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
137
- }
138
 
139
  #ifdef GGML_ALLOCATOR_DEBUG
140
- add_allocated_tensor(alloc, tensor);
141
- size_t cur_max = (char*)addr - (char*)alloc->base + size;
142
  if (cur_max > alloc->max_size) {
143
- printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  for (int i = 0; i < 1024; i++) {
145
- if (alloc->allocated_tensors[i]) {
146
- printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0);
 
 
 
147
  }
148
  }
149
- printf("\n");
150
  }
151
  #endif
152
 
153
- alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->base + size);
154
- }
155
 
156
- // this is a very naive implementation, but for our case the number of free blocks should be very small
157
- static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
158
- if (ggml_tallocr_is_own(alloc, tensor) == false) {
159
- // the tensor was not allocated in this buffer
160
- // this can happen because the graph allocator will try to free weights and other tensors from different buffers
161
- // the easiest way to deal with this is just to ignore it
162
- // AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
163
- return;
164
- }
165
 
166
- void * ptr = tensor->data;
 
167
 
168
- size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
 
169
  size = aligned_offset(NULL, size, alloc->alignment);
170
- AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
 
171
 
172
  #ifdef GGML_ALLOCATOR_DEBUG
173
- remove_allocated_tensor(alloc, tensor);
174
  #endif
175
 
176
  // see if we can merge with an existing block
177
  for (int i = 0; i < alloc->n_free_blocks; i++) {
178
  struct free_block * block = &alloc->free_blocks[i];
179
  // check if ptr is at the end of the block
180
- if ((char*)block->addr + block->size == ptr) {
181
  block->size += size;
182
  // check if we can merge with the next block
183
- if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) {
184
  block->size += alloc->free_blocks[i+1].size;
185
  alloc->n_free_blocks--;
186
  for (int j = i+1; j < alloc->n_free_blocks; j++) {
@@ -190,11 +275,11 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
190
  return;
191
  }
192
  // check if ptr is at the beginning of the block
193
- if ((char*)ptr + size == block->addr) {
194
- block->addr = ptr;
195
  block->size += size;
196
  // check if we can merge with the previous block
197
- if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) {
198
  alloc->free_blocks[i-1].size += block->size;
199
  alloc->n_free_blocks--;
200
  for (int j = i; j < alloc->n_free_blocks; j++) {
@@ -208,7 +293,7 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
208
  GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
209
  // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
210
  int insert_pos = 0;
211
- while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) {
212
  insert_pos++;
213
  }
214
  // shift all blocks from insert_pos onward to make room for the new block
@@ -216,337 +301,271 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
216
  alloc->free_blocks[i] = alloc->free_blocks[i-1];
217
  }
218
  // insert the new block
219
- alloc->free_blocks[insert_pos].addr = ptr;
220
  alloc->free_blocks[insert_pos].size = size;
221
  alloc->n_free_blocks++;
 
 
222
  }
223
 
224
- void ggml_tallocr_reset(ggml_tallocr_t alloc) {
225
  alloc->n_free_blocks = 1;
226
- size_t align_offset = aligned_offset(alloc->base, 0, alloc->alignment);
227
- alloc->free_blocks[0].addr = (char *)alloc->base + align_offset;
228
-
229
- if (alloc->measure) {
230
- alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
231
- } else {
232
- alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
233
- ggml_backend_buffer_reset(alloc->buffer);
234
- }
235
  }
236
 
237
- ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
238
- struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size);
239
-
240
- ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
241
 
242
- *alloc = (struct ggml_tallocr) {
243
- /*.buffer = */ buffer,
244
- /*.buffer_owned = */ true,
245
- /*.base = */ ggml_backend_buffer_get_base(buffer),
246
  /*.alignment = */ alignment,
247
  /*.n_free_blocks = */ 0,
248
  /*.free_blocks = */ {{0}},
249
  /*.max_size = */ 0,
250
- /*.measure = */ false,
251
  #ifdef GGML_ALLOCATOR_DEBUG
252
- /*.allocated_tensors = */ {0},
253
  #endif
254
  };
255
 
256
- ggml_tallocr_reset(alloc);
257
-
258
- return alloc;
259
- }
260
-
261
- ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
262
- ggml_tallocr_t alloc = ggml_tallocr_new((void *)0x1000, SIZE_MAX/2, alignment);
263
- alloc->measure = true;
264
 
265
  return alloc;
266
  }
267
 
268
- ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
269
- // create a backend buffer to get the correct tensor allocation sizes
270
- ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);
271
-
272
- // TODO: move alloc initialization to a common ggml_tallocr_new_impl function
273
- ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
274
- alloc->buffer_owned = true;
275
- alloc->measure = true;
276
- ggml_tallocr_reset(alloc);
277
- return alloc;
278
- }
279
-
280
- ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
281
- return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
282
- }
283
-
284
- ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
285
- // create a backend buffer to get the correct tensor allocation sizes
286
- ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
287
- ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
288
- alloc->buffer_owned = true;
289
- return alloc;
290
- }
291
-
292
- ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
293
- return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
294
- }
295
-
296
- ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
297
- ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
298
-
299
- *alloc = (struct ggml_tallocr) {
300
- /*.buffer = */ buffer,
301
- /*.buffer_owned = */ false,
302
- /*.base = */ ggml_backend_buffer_get_base(buffer),
303
- /*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
304
- /*.n_free_blocks = */ 0,
305
- /*.free_blocks = */ {{0}},
306
- /*.max_size = */ 0,
307
- /*.measure = */ false,
308
- #ifdef GGML_ALLOCATOR_DEBUG
309
- /*.allocated_tensors = */ {0},
310
- #endif
311
- };
312
-
313
- ggml_tallocr_reset(alloc);
314
-
315
- return alloc;
316
- }
317
-
318
- struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) {
319
- return alloc->buffer;
320
- }
321
-
322
- void ggml_tallocr_free(ggml_tallocr_t alloc) {
323
- if (alloc == NULL) {
324
- return;
325
- }
326
-
327
- if (alloc->buffer_owned) {
328
- ggml_backend_buffer_free(alloc->buffer);
329
- }
330
  free(alloc);
331
  }
332
 
333
- bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
334
- return alloc->measure;
335
  }
336
 
337
- size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
338
- // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
339
- // to avoid this, we add a 10% margin to the buffer size
340
- return alloc->max_size + alloc->max_size/10;
341
- }
342
 
343
  // graph allocator
344
 
345
  struct hash_node {
346
  int n_children;
347
  int n_views;
348
  };
349
 
350
  struct ggml_gallocr {
351
- ggml_tallocr_t talloc;
 
 
 
 
352
  struct ggml_hash_set hash_set;
353
- struct hash_node * hash_values;
354
- size_t hash_values_size;
355
- ggml_tallocr_t * hash_allocs;
356
- int * parse_seq;
357
- int parse_seq_len;
358
  };
359
 
360
- ggml_gallocr_t ggml_gallocr_new(void) {
361
- ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr));
362
-
363
- *galloc = (struct ggml_gallocr) {
364
- /*.talloc = */ NULL,
365
- /*.hash_set = */ {0},
366
- /*.hash_values = */ NULL,
367
- /*.hash_values_size = */ 0,
368
- /*.hash_allocs = */ NULL,
369
- /*.parse_seq = */ NULL,
370
- /*.parse_seq_len = */ 0,
371
- };
 
 
 
 
 
 
 
 
372
 
373
  return galloc;
374
  }
375
 
 
 
 
 
376
  void ggml_gallocr_free(ggml_gallocr_t galloc) {
377
  if (galloc == NULL) {
378
  return;
379
  }
380
 
381
- if (galloc->hash_set.keys != NULL) {
382
- free(galloc->hash_set.keys);
383
- }
384
- if (galloc->hash_values != NULL) {
385
- free(galloc->hash_values);
386
- }
387
- if (galloc->hash_allocs != NULL) {
388
- free(galloc->hash_allocs);
389
- }
390
- if (galloc->parse_seq != NULL) {
391
- free(galloc->parse_seq);
392
  }
 
 
 
 
 
 
 
393
  free(galloc);
394
  }
395
 
396
- void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) {
397
- free(galloc->parse_seq);
398
- galloc->parse_seq = malloc(sizeof(int) * n);
399
 
400
- for (int i = 0; i < n; i++) {
401
- galloc->parse_seq[i] = list[i];
402
- }
403
- galloc->parse_seq_len = n;
404
- }
405
-
406
- static struct hash_node * hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
407
  size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
408
  return &galloc->hash_values[i];
409
  }
410
 
411
- static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
412
- if (a->type != b->type) {
413
- return false;
414
- }
415
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
416
- if (a->ne[i] != b->ne[i]) {
417
- return false;
418
- }
419
- if (a->nb[i] != b->nb[i]) {
420
- return false;
421
- }
422
- }
423
- return true;
424
  }
425
 
426
- static bool ggml_op_can_inplace(enum ggml_op op) {
427
- switch (op) {
428
- case GGML_OP_SCALE:
429
- case GGML_OP_DIAG_MASK_ZERO:
430
- case GGML_OP_DIAG_MASK_INF:
431
- case GGML_OP_ADD:
432
- case GGML_OP_ADD1:
433
- case GGML_OP_SUB:
434
- case GGML_OP_MUL:
435
- case GGML_OP_DIV:
436
- case GGML_OP_SQR:
437
- case GGML_OP_SQRT:
438
- case GGML_OP_LOG:
439
- case GGML_OP_UNARY:
440
- case GGML_OP_ROPE:
441
- case GGML_OP_RMS_NORM:
442
- case GGML_OP_SOFT_MAX:
443
- return true;
444
-
445
- default:
446
- return false;
447
- }
448
  }
449
 
450
- static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * node) {
451
- if (galloc->talloc != NULL) {
452
- return galloc->talloc;
453
- }
454
-
455
- return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
456
  }
457
 
458
- static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
459
- ggml_tallocr_t alloc = node_tallocr(galloc, view);
460
-
461
- GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
462
- if (update_backend) {
463
- view->backend = view->view_src->backend;
464
- }
465
- // views are initialized in the alloc buffer rather than the view_src buffer
466
- view->buffer = alloc->buffer;
467
- view->data = (char *)view->view_src->data + view->view_offs;
468
 
469
- assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
 
 
470
 
471
- if (!alloc->measure) {
472
- ggml_backend_buffer_init_tensor(alloc->buffer, view);
473
- }
474
- }
 
 
 
475
 
476
- static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
477
- ggml_tallocr_t alloc = node_tallocr(galloc, node);
 
 
 
478
 
479
- if (node->data == NULL) {
480
- if (ggml_is_view(node)) {
481
- init_view(galloc, node, true);
482
- } else {
483
- // see if we can reuse a parent's buffer (inplace)
484
- if (ggml_op_can_inplace(node->op)) {
485
- for (int i = 0; i < GGML_MAX_SRC; i++) {
486
- struct ggml_tensor * parent = node->src[i];
487
- if (parent == NULL) {
488
- break;
489
- }
490
 
491
- // if the node's data is external, then we cannot re-use it
492
- if (ggml_tallocr_is_own(alloc, parent) == false) {
493
- AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
494
- continue;
495
- }
496
 
497
- struct hash_node * p_hn = hash_get(galloc, parent);
498
- if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
499
- if (ggml_is_view(parent)) {
500
- struct ggml_tensor * view_src = parent->view_src;
501
- struct hash_node * view_src_hn = hash_get(galloc, view_src);
502
- if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
503
- // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
504
- // the parent's data that it will need later (same layout requirement). the problem is that then
505
- // we cannot free the tensor because the original address of the allocation is lost.
506
- // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
507
- // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
508
- AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
509
- node->view_src = view_src;
510
- view_src_hn->n_views += 1;
511
- init_view(galloc, node, false);
512
- return;
513
- }
514
- } else {
515
- AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
516
- node->view_src = parent;
517
- p_hn->n_views += 1;
518
- init_view(galloc, node, false);
519
  return;
520
  }
 
 
 
 
 
 
521
  }
522
  }
523
  }
524
- ggml_tallocr_alloc(alloc, node);
525
  }
 
 
 
 
 
 
 
 
526
  }
527
  }
528
 
529
- static void free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
530
- ggml_tallocr_t alloc = node_tallocr(galloc, node);
 
 
 
 
531
 
532
- ggml_tallocr_free_tensor(alloc, node);
 
 
 
 
 
 
533
  }
534
 
535
- static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * gf) {
536
- const int * parse_seq = galloc->parse_seq;
537
- int parse_seq_len = galloc->parse_seq_len;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
538
 
539
  // count number of children and views
540
- for (int i = 0; i < gf->n_nodes; i++) {
541
- struct ggml_tensor * node = gf->nodes[i];
542
 
543
  if (ggml_is_view(node)) {
544
  struct ggml_tensor * view_src = node->view_src;
545
- hash_get(galloc, view_src)->n_views += 1;
546
- if (node->buffer == NULL && node->data != NULL) {
547
- // view of a pre-allocated tensor, didn't call init_view() yet
548
- init_view(galloc, node, true);
549
- }
550
  }
551
 
552
  for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -554,227 +573,283 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
554
  if (parent == NULL) {
555
  break;
556
  }
557
- hash_get(galloc, parent)->n_children += 1;
558
- if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
559
- init_view(galloc, parent, true);
560
- }
561
  }
562
  }
563
 
564
  // allocate tensors
565
- // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
566
- int last_barrier_pos = 0;
567
- int n_nodes = parse_seq_len ? parse_seq_len : gf->n_nodes;
568
-
569
- for (int ind = 0; ind < n_nodes; ind++) {
570
- // allocate a node if there is no parse_seq or this is not a barrier
571
- if (parse_seq_len == 0 || parse_seq[ind] != -1) {
572
- int i = parse_seq_len ? parse_seq[ind] : ind;
573
- struct ggml_tensor * node = gf->nodes[i];
574
-
575
- // allocate parents (leafs)
576
- for (int j = 0; j < GGML_MAX_SRC; j++) {
577
- struct ggml_tensor * parent = node->src[j];
578
- if (parent == NULL) {
579
- break;
580
- }
581
- allocate_node(galloc, parent);
582
  }
 
 
583
 
584
- // allocate node
585
- allocate_node(galloc, node);
586
 
587
- AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
588
- for (int j = 0; j < GGML_MAX_SRC; j++) {
589
- struct ggml_tensor * parent = node->src[j];
590
- if (parent == NULL) {
591
- break;
592
- }
593
- AT_PRINTF("%s", parent->name);
594
- if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
595
- AT_PRINTF(", ");
596
- }
597
  }
598
- AT_PRINTF("\n");
599
  }
 
600
 
601
  // update parents
602
- // update immediately if there is no parse_seq
603
- // update only at barriers if there is parse_seq
604
- if ((parse_seq_len == 0) || parse_seq[ind] == -1) {
605
- int update_start = parse_seq_len ? last_barrier_pos : ind;
606
- int update_end = parse_seq_len ? ind : ind + 1;
607
- for (int i = update_start; i < update_end; i++) {
608
- int node_i = parse_seq_len ? parse_seq[i] : i;
609
- struct ggml_tensor * node = gf->nodes[node_i];
610
-
611
- for (int j = 0; j < GGML_MAX_SRC; j++) {
612
- struct ggml_tensor * parent = node->src[j];
613
- if (parent == NULL) {
614
- break;
615
- }
616
- struct hash_node * p_hn = hash_get(galloc, parent);
617
- p_hn->n_children -= 1;
618
-
619
- //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
620
-
621
- if (p_hn->n_children == 0 && p_hn->n_views == 0) {
622
- if (ggml_is_view(parent)) {
623
- struct ggml_tensor * view_src = parent->view_src;
624
- struct hash_node * view_src_hn = hash_get(galloc, view_src);
625
- view_src_hn->n_views -= 1;
626
- AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
627
- if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0) {
628
- free_node(galloc, view_src);
629
- }
630
- }
631
- else {
632
- free_node(galloc, parent);
633
- }
634
  }
635
  }
 
 
 
636
  }
637
  AT_PRINTF("\n");
638
- if (parse_seq_len) {
639
- last_barrier_pos = ind + 1;
640
- }
641
  }
642
  }
643
  }
644
 
645
- size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph) {
646
  size_t hash_size = graph->visited_hash_table.size;
647
 
648
- // check if the hash table is initialized and large enough
649
  if (galloc->hash_set.size < hash_size) {
650
- if (galloc->hash_set.keys != NULL) {
651
- free(galloc->hash_set.keys);
652
- }
653
- if (galloc->hash_values != NULL) {
654
- free(galloc->hash_values);
655
- }
656
- galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size);
657
  galloc->hash_set.size = hash_size;
658
- galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
 
 
 
 
 
 
 
659
  }
660
 
661
- // reset hash table
662
- memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * hash_size);
663
- memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
664
-
665
- galloc->talloc = talloc;
666
- ggml_tallocr_alloc_graph_impl(galloc, graph);
667
- galloc->talloc = NULL;
668
-
669
- size_t max_size = ggml_tallocr_max_size(talloc);
670
-
671
- return max_size;
672
- }
673
-
674
- void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
675
- const size_t hash_size = hash_set.size;
676
-
677
- GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
678
 
679
- galloc->talloc = NULL;
 
680
 
681
- // alloc hash_values if needed
682
- if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) {
683
- free(galloc->hash_values);
684
- galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
685
- galloc->hash_values_size = hash_size;
686
  }
687
-
688
- // free hash_set.keys if needed
689
- if (galloc->hash_set.keys != NULL) {
690
- free(galloc->hash_set.keys);
691
  }
692
- galloc->hash_set = hash_set;
693
 
694
- // reset hash values
695
- memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
 
 
696
 
697
- galloc->hash_allocs = hash_node_talloc;
698
-
699
- ggml_tallocr_alloc_graph_impl(galloc, graph);
 
 
 
 
 
 
 
 
 
700
 
701
- // remove unowned resources
702
- galloc->hash_set.keys = NULL;
703
- galloc->hash_allocs = NULL;
704
  }
705
 
706
- // legacy API wrapper
707
-
708
- struct ggml_allocr {
709
- ggml_tallocr_t talloc;
710
- ggml_gallocr_t galloc;
711
- };
712
-
713
- static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) {
714
- ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr));
715
- *alloc = (struct ggml_allocr) {
716
- /*.talloc = */ talloc,
717
- /*.galloc = */ ggml_gallocr_new(),
718
- };
719
- return alloc;
720
  }
721
 
722
- ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment) {
723
- return ggml_allocr_new_impl(ggml_tallocr_new(data, size, alignment));
724
- }
725
 
726
- ggml_allocr_t ggml_allocr_new_measure(size_t alignment) {
727
- return ggml_allocr_new_impl(ggml_tallocr_new_measure(alignment));
728
- }
729
 
730
- ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
731
- return ggml_allocr_new_impl(ggml_tallocr_new_from_buffer(buffer));
 
 
 
 
 
 
 
 
732
  }
733
 
734
- ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size) {
735
- return ggml_allocr_new_impl(ggml_tallocr_new_from_backend(backend, size));
 
 
736
  }
737
 
738
- ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend) {
739
- return ggml_allocr_new_impl(ggml_tallocr_new_measure_from_backend(backend));
740
- }
 
 
 
 
741
 
742
- struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc) {
743
- return ggml_tallocr_get_buffer(alloc->talloc);
744
- }
745
 
746
- void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
747
- ggml_gallocr_set_parse_seq(alloc->galloc, list, n);
748
- }
 
 
 
749
 
750
- void ggml_allocr_free(ggml_allocr_t alloc) {
751
- if (alloc == NULL) {
752
- return;
 
 
 
 
 
 
 
 
 
753
  }
754
 
755
- ggml_gallocr_free(alloc->galloc);
756
- ggml_tallocr_free(alloc->talloc);
757
- free(alloc);
758
  }
759
 
760
- bool ggml_allocr_is_measure(ggml_allocr_t alloc) {
761
- return ggml_tallocr_is_measure(alloc->talloc);
762
- }
763
 
764
- void ggml_allocr_reset(ggml_allocr_t alloc) {
765
- ggml_tallocr_reset(alloc->talloc);
766
- }
 
 
 
 
767
 
768
- void ggml_allocr_alloc(ggml_allocr_t alloc, struct ggml_tensor * tensor) {
769
- ggml_tallocr_alloc(alloc->talloc, tensor);
770
- }
 
 
 
 
 
 
 
 
 
 
771
 
772
- size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
773
- return ggml_tallocr_max_size(alloc->talloc);
774
  }
775
 
776
- size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
777
- return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
 
 
 
 
 
778
  }
779
 
780
  // utils
@@ -795,17 +870,17 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
795
  return false;
796
  }
797
 
798
- ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
799
 
800
  for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
801
  if (t->data == NULL) {
802
  if (t->view_src == NULL) {
803
  ggml_tallocr_alloc(tallocr, t);
804
- } else {
805
  ggml_backend_view_init(buffer, t);
806
  }
807
  } else {
808
- if (t->view_src != NULL) {
809
  // view of a pre-allocated tensor
810
  ggml_backend_view_init(buffer, t);
811
  }
@@ -838,7 +913,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
838
  }
839
 
840
  if (this_size > max_size) {
841
- // tensor is too large to fit in a single buffer
842
  fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
843
  __func__, t->name,
844
  ggml_backend_buft_name(buft),
@@ -870,7 +944,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
870
  }
871
 
872
  if (n_buffers == 0) {
873
- // all the tensors in the context are already allocated
874
  #ifndef NDEBUG
875
  fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
876
  #endif
 
17
  //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
18
  #define AT_PRINTF(...)
19
 
20
+
21
+ static bool ggml_is_view(const struct ggml_tensor * t) {
22
+ return t->view_src != NULL;
23
+ }
24
+
25
+ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
26
+ if (a->type != b->type) {
27
+ return false;
28
+ }
29
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
30
+ if (a->ne[i] != b->ne[i]) {
31
+ return false;
32
+ }
33
+ if (a->nb[i] != b->nb[i]) {
34
+ return false;
35
+ }
36
+ }
37
+ return true;
38
+ }
39
+
40
+ static bool ggml_op_can_inplace(enum ggml_op op) {
41
+ switch (op) {
42
+ case GGML_OP_SCALE:
43
+ case GGML_OP_DIAG_MASK_ZERO:
44
+ case GGML_OP_DIAG_MASK_INF:
45
+ case GGML_OP_ADD:
46
+ case GGML_OP_ADD1:
47
+ case GGML_OP_SUB:
48
+ case GGML_OP_MUL:
49
+ case GGML_OP_DIV:
50
+ case GGML_OP_SQR:
51
+ case GGML_OP_SQRT:
52
+ case GGML_OP_LOG:
53
+ case GGML_OP_UNARY:
54
+ case GGML_OP_ROPE:
55
+ case GGML_OP_RMS_NORM:
56
+ case GGML_OP_SOFT_MAX:
57
+ return true;
58
+
59
+ default:
60
+ return false;
61
+ }
62
+ }
63
+
64
  // TODO: GGML_PAD ?
65
  static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
66
  assert(alignment && !(alignment & (alignment - 1))); // power of 2
 
68
  return offset + align;
69
  }
70
 
71
+ // tallocr
72
+ struct ggml_tallocr {
73
+ ggml_backend_buffer_t buffer;
74
+ void * base;
75
+ size_t alignment;
76
+ size_t offset;
77
+ };
78
+
79
+ ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
80
+ ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
81
+ if (talloc == NULL) {
82
+ return NULL;
83
+ }
84
+
85
+ void * base = ggml_backend_buffer_get_base(buffer);
86
+ size_t align = ggml_backend_buffer_get_alignment(buffer);
87
+
88
+ assert(align && !(align & (align - 1))); // power of 2
89
+
90
+ *talloc = (struct ggml_tallocr) {
91
+ /*.buffer = */ buffer,
92
+ /*.base = */ base,
93
+ /*.alignment = */ align,
94
+ /*.offset = */ aligned_offset(base, 0, align),
95
+ };
96
+ return talloc;
97
+ }
98
+
99
+ void ggml_tallocr_free(ggml_tallocr_t talloc) {
100
+ free(talloc);
101
+ }
102
+
103
+ void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
104
+ size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
105
+ size = GGML_PAD(size, talloc->alignment);
106
+
107
+ if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
108
+ fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
109
+ __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
110
+ GGML_ASSERT(!"not enough space in the buffer");
111
+ return;
112
+ }
113
+
114
+ void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
115
+ talloc->offset += size;
116
+
117
+ assert(((uintptr_t)addr % talloc->alignment) == 0);
118
+
119
+ ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
120
+ }
121
+
122
+ // dynamic tensor allocator
123
+
124
  struct free_block {
125
+ size_t offset;
126
  size_t size;
127
  };
128
 
129
+ struct ggml_dyn_tallocr {
 
 
 
130
  size_t alignment;
 
131
  int n_free_blocks;
132
  struct free_block free_blocks[MAX_FREE_BLOCKS];
 
133
  size_t max_size;
134
 
 
 
135
  #ifdef GGML_ALLOCATOR_DEBUG
136
+ struct {
137
+ const struct ggml_tensor * tensor;
138
+ size_t offset;
139
+ } allocated_tensors[1024];
140
  #endif
141
  };
142
 
143
  #ifdef GGML_ALLOCATOR_DEBUG
144
+ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
145
  for (int i = 0; i < 1024; i++) {
146
+ if (alloc->allocated_tensors[i].tensor == NULL) {
147
+ alloc->allocated_tensors[i].tensor = tensor;
148
+ alloc->allocated_tensors[i].offset = offset;
149
  return;
150
  }
151
  }
152
  GGML_ASSERT(!"out of allocated_tensors");
153
  }
154
+ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
155
  for (int i = 0; i < 1024; i++) {
156
+ if (alloc->allocated_tensors[i].offset == offset) {
157
+ alloc->allocated_tensors[i].tensor = NULL;
 
158
  return;
159
  }
160
  }
161
+ fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
162
  GGML_ASSERT(!"tensor not found");
163
  }
164
  #endif
165
 
166
+ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  size = aligned_offset(NULL, size, alloc->alignment);
168
 
169
  AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
 
189
  if (block->size >= size) {
190
  best_fit_block = alloc->n_free_blocks - 1;
191
  } else {
192
+ // this should never happen
193
+ fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
194
+ __func__, size, max_avail);
195
  GGML_ASSERT(!"not enough space in the buffer");
196
+ GGML_UNREACHABLE();
197
  }
198
  }
199
 
200
  struct free_block * block = &alloc->free_blocks[best_fit_block];
201
+ size_t offset = block->offset;
202
+ block->offset = offset + size;
203
  block->size -= size;
204
  if (block->size == 0) {
205
  // remove block if empty
 
209
  }
210
  }
211
 
212
+ AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
 
 
 
 
 
 
213
 
214
  #ifdef GGML_ALLOCATOR_DEBUG
215
+ add_allocated_tensor(alloc, offset, tensor);
216
+ size_t cur_max = offset + size;
217
  if (cur_max > alloc->max_size) {
218
+ // sort allocated_tensors by offset
219
+ for (int i = 0; i < 1024; i++) {
220
+ for (int j = i + 1; j < 1024; j++) {
221
+ if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
222
+ const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
223
+ size_t tmp_offset = alloc->allocated_tensors[i].offset;
224
+ alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
225
+ alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
226
+ alloc->allocated_tensors[j].tensor = tmp_tensor;
227
+ alloc->allocated_tensors[j].offset = tmp_offset;
228
+ }
229
+ }
230
+ }
231
+ fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
232
  for (int i = 0; i < 1024; i++) {
233
+ if (alloc->allocated_tensors[i].tensor) {
234
+ fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
235
+ alloc->allocated_tensors[i].offset,
236
+ alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
237
+ ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
238
  }
239
  }
240
+ fprintf(stderr, "\n");
241
  }
242
  #endif
243
 
244
+ alloc->max_size = MAX(alloc->max_size, offset + size);
 
245
 
246
+ return offset;
 
 
 
 
 
 
 
 
247
 
248
+ GGML_UNUSED(tensor);
249
+ }
250
 
251
+ // this is a very naive implementation, but for our case the number of free blocks should be very small
252
+ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
253
  size = aligned_offset(NULL, size, alloc->alignment);
254
+
255
+ AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
256
 
257
  #ifdef GGML_ALLOCATOR_DEBUG
258
+ remove_allocated_tensor(alloc, offset, tensor);
259
  #endif
260
 
261
  // see if we can merge with an existing block
262
  for (int i = 0; i < alloc->n_free_blocks; i++) {
263
  struct free_block * block = &alloc->free_blocks[i];
264
  // check if ptr is at the end of the block
265
+ if (block->offset + block->size == offset) {
266
  block->size += size;
267
  // check if we can merge with the next block
268
+ if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
269
  block->size += alloc->free_blocks[i+1].size;
270
  alloc->n_free_blocks--;
271
  for (int j = i+1; j < alloc->n_free_blocks; j++) {
 
275
  return;
276
  }
277
  // check if ptr is at the beginning of the block
278
+ if (offset + size == block->offset) {
279
+ block->offset = offset;
280
  block->size += size;
281
  // check if we can merge with the previous block
282
+ if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
283
  alloc->free_blocks[i-1].size += block->size;
284
  alloc->n_free_blocks--;
285
  for (int j = i; j < alloc->n_free_blocks; j++) {
 
293
  GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
294
  // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
295
  int insert_pos = 0;
296
+ while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
297
  insert_pos++;
298
  }
299
  // shift all blocks from insert_pos onward to make room for the new block
 
301
  alloc->free_blocks[i] = alloc->free_blocks[i-1];
302
  }
303
  // insert the new block
304
+ alloc->free_blocks[insert_pos].offset = offset;
305
  alloc->free_blocks[insert_pos].size = size;
306
  alloc->n_free_blocks++;
307
+
308
+ GGML_UNUSED(tensor);
309
  }
310
 
311
+ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
312
  alloc->n_free_blocks = 1;
313
+ alloc->free_blocks[0].offset = 0;
314
+ alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
315
+ alloc->max_size = 0;
 
 
 
 
 
 
316
  }
317
 
318
+ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
319
+ struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
 
 
320
 
321
+ *alloc = (struct ggml_dyn_tallocr) {
 
 
 
322
  /*.alignment = */ alignment,
323
  /*.n_free_blocks = */ 0,
324
  /*.free_blocks = */ {{0}},
325
  /*.max_size = */ 0,
 
326
  #ifdef GGML_ALLOCATOR_DEBUG
327
+ /*.allocated_tensors = */ {{0}},
328
  #endif
329
  };
330
 
331
+ ggml_dyn_tallocr_reset(alloc);
 
 
 
 
 
 
 
332
 
333
  return alloc;
334
  }
335
 
336
+ static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
337
  free(alloc);
338
  }
339
 
340
+ static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
341
+ return alloc->max_size;
342
  }
343
 
344
+
345
+ /////////////////////////////////////
 
 
 
346
 
347
  // graph allocator
348
 
349
  struct hash_node {
350
  int n_children;
351
  int n_views;
352
+ int buffer_id;
353
+ size_t offset; // offset within the buffer
354
+ bool allocated;
355
+ };
356
+
357
+ //
358
+ struct tensor_alloc {
359
+ size_t offset;
360
+ size_t size_max; // 0 = pre-allocated, unused, or view
361
+ };
362
+
363
+ struct node_alloc {
364
+ int buffer_id;
365
+ struct tensor_alloc dst;
366
+ struct tensor_alloc src[GGML_MAX_SRC];
367
  };
368
 
369
  struct ggml_gallocr {
370
+ ggml_backend_buffer_type_t * bufts; // [n_buffers]
371
+ ggml_backend_buffer_t * buffers; // [n_buffers]
372
+ struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
373
+ int n_buffers;
374
+
375
  struct ggml_hash_set hash_set;
376
+ struct hash_node * hash_values; // [hash_set.size]
377
+
378
+ struct node_alloc * node_allocs; // [n_nodes]
379
+ int n_nodes;
 
380
  };
381
 
382
+ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
383
+ ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
384
+ GGML_ASSERT(galloc != NULL);
385
+
386
+ galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
387
+ GGML_ASSERT(galloc->bufts != NULL);
388
+
389
+ galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
390
+ GGML_ASSERT(galloc->buffers != NULL);
391
+
392
+ galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
393
+ GGML_ASSERT(galloc->buf_tallocs != NULL);
394
+
395
+ for (int i = 0; i < n_bufs; i++) {
396
+ galloc->bufts[i] = bufts[i];
397
+ galloc->buffers[i] = NULL;
398
+ size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
399
+ galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
400
+ }
401
+ galloc->n_buffers = n_bufs;
402
 
403
  return galloc;
404
  }
405
 
406
+ ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
407
+ return ggml_gallocr_new_n(&buft, 1);
408
+ }
409
+
410
  void ggml_gallocr_free(ggml_gallocr_t galloc) {
411
  if (galloc == NULL) {
412
  return;
413
  }
414
 
415
+ for (int i = 0; i < galloc->n_buffers; i++) {
416
+ if (galloc->buffers != NULL) {
417
+ ggml_backend_buffer_free(galloc->buffers[i]);
418
+ }
419
+ if (galloc->buf_tallocs != NULL) {
420
+ ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
421
+ }
 
 
 
 
422
  }
423
+
424
+ free(galloc->hash_set.keys);
425
+ free(galloc->hash_values);
426
+ free(galloc->bufts);
427
+ free(galloc->buffers);
428
+ free(galloc->buf_tallocs);
429
+ free(galloc->node_allocs);
430
  free(galloc);
431
  }
432
 
433
+ typedef struct ggml_gallocr * ggml_gallocr_t;
 
 
434
 
435
+ static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
 
 
 
 
 
 
436
  size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
437
  return &galloc->hash_values[i];
438
  }
439
 
440
+ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
441
+ return ggml_gallocr_hash_get(galloc, t)->allocated;
 
 
 
 
 
 
 
 
 
 
 
442
  }
443
 
444
+ static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
445
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
446
+ hn->buffer_id = buffer_id;
447
+ hn->offset = offset;
448
+ hn->allocated = true;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
  }
450
 
451
+ static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
452
+ return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
 
 
 
 
453
  }
454
 
455
+ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
456
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
 
 
 
 
 
 
 
 
457
 
458
+ if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
459
+ hn->allocated = true;
460
+ assert(hn->offset == 0);
461
 
462
+ // try to reuse a parent's buffer (inplace)
463
+ if (ggml_op_can_inplace(node->op)) {
464
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
465
+ struct ggml_tensor * parent = node->src[i];
466
+ if (parent == NULL) {
467
+ break;
468
+ }
469
 
470
+ // if the node's data is external, then we cannot re-use it
471
+ if (!ggml_gallocr_is_own(galloc, parent)) {
472
+ AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
473
+ continue;
474
+ }
475
 
476
+ // outputs cannot be reused
477
+ if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
478
+ AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
479
+ continue;
480
+ }
 
 
 
 
 
 
481
 
482
+ if (!ggml_are_same_layout(node, parent)) {
483
+ AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
484
+ continue;
485
+ }
 
486
 
487
+ struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
488
+ if (p_hn->n_children == 1 && p_hn->n_views == 0) {
489
+ if (ggml_is_view(parent)) {
490
+ struct ggml_tensor * view_src = parent->view_src;
491
+ struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
492
+ if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
493
+ AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
494
+ assert(view_src_hn->offset == p_hn->offset);
495
+ hn->buffer_id = p_hn->buffer_id;
496
+ hn->offset = p_hn->offset;
497
+ p_hn->allocated = false; // avoid freeing the parent
498
+ view_src_hn->allocated = false;
 
 
 
 
 
 
 
 
 
 
499
  return;
500
  }
501
+ } else {
502
+ AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
503
+ hn->buffer_id = p_hn->buffer_id;
504
+ hn->offset = p_hn->offset;
505
+ p_hn->allocated = false; // avoid freeing the parent
506
+ return;
507
  }
508
  }
509
  }
 
510
  }
511
+ // allocate tensor from the buffer
512
+ struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
513
+ ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
514
+ size_t size = ggml_backend_buft_get_alloc_size(buft, node);
515
+ size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
516
+ hn->buffer_id = buffer_id;
517
+ hn->offset = offset;
518
+ return;
519
  }
520
  }
521
 
522
+ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
523
+ // graph outputs are never freed
524
+ if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
525
+ AT_PRINTF("not freeing output %s\n", node->name);
526
+ return;
527
+ }
528
 
529
+ struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
530
+ ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
531
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
532
+ size_t offset = hn->offset;
533
+ size_t size = ggml_backend_buft_get_alloc_size(buft, node);
534
+ ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
535
+ hn->allocated = false;
536
  }
537
 
538
+ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
539
+ return node_buffer_ids ? node_buffer_ids[i] : 0;
540
+ }
541
+
542
+ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
543
+ // clear hash tables
544
+ memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
545
+ memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
546
+
547
+ // allocate all graph inputs first to avoid overwriting them
548
+ for (int i = 0; i < graph->n_nodes; i++) {
549
+ if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
550
+ ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
551
+ }
552
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
553
+ if (graph->nodes[i]->src[j] == NULL) {
554
+ break;
555
+ }
556
+ if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
557
+ ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
558
+ }
559
+ }
560
+ }
561
 
562
  // count number of children and views
563
+ for (int i = 0; i < graph->n_nodes; i++) {
564
+ struct ggml_tensor * node = graph->nodes[i];
565
 
566
  if (ggml_is_view(node)) {
567
  struct ggml_tensor * view_src = node->view_src;
568
+ ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
 
 
 
 
569
  }
570
 
571
  for (int j = 0; j < GGML_MAX_SRC; j++) {
 
573
  if (parent == NULL) {
574
  break;
575
  }
576
+ ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
 
 
 
577
  }
578
  }
579
 
580
  // allocate tensors
581
+ for (int i = 0; i < graph->n_nodes; i++) {
582
+ struct ggml_tensor * node = graph->nodes[i];
583
+ int buffer_id = get_node_buffer_id(node_buffer_ids, i);
584
+
585
+ // allocate parents (only leafs need to be allocated at this point)
586
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
587
+ struct ggml_tensor * parent = node->src[j];
588
+ if (parent == NULL) {
589
+ break;
 
 
 
 
 
 
 
 
590
  }
591
+ ggml_gallocr_allocate_node(galloc, parent, buffer_id);
592
+ }
593
 
594
+ // allocate node
595
+ ggml_gallocr_allocate_node(galloc, node, buffer_id);
596
 
597
+ AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
598
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
599
+ struct ggml_tensor * parent = node->src[j];
600
+ if (parent == NULL) {
601
+ break;
602
+ }
603
+ AT_PRINTF("%s", parent->name);
604
+ if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
605
+ AT_PRINTF(", ");
 
606
  }
 
607
  }
608
+ AT_PRINTF("\n");
609
 
610
  // update parents
611
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
612
+ struct ggml_tensor * parent = node->src[j];
613
+ if (parent == NULL) {
614
+ break;
615
+ }
616
+ struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
617
+ p_hn->n_children -= 1;
618
+
619
+ AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
620
+ parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
621
+
622
+ if (p_hn->n_children == 0 && p_hn->n_views == 0) {
623
+ if (ggml_is_view(parent)) {
624
+ struct ggml_tensor * view_src = parent->view_src;
625
+ struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
626
+ view_src_hn->n_views -= 1;
627
+ AT_PRINTF("view_src %s: %d children, %d views\n",
628
+ view_src->name, view_src_hn->n_children, view_src_hn->n_views);
629
+ if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
630
+ ggml_gallocr_free_node(galloc, view_src, buffer_id);
 
 
 
 
 
 
 
 
 
 
 
 
631
  }
632
  }
633
+ else if (p_hn->allocated) {
634
+ ggml_gallocr_free_node(galloc, parent, buffer_id);
635
+ }
636
  }
637
  AT_PRINTF("\n");
 
 
 
638
  }
639
  }
640
  }
641
 
642
+ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
643
  size_t hash_size = graph->visited_hash_table.size;
644
 
645
+ // initialize hash table
646
  if (galloc->hash_set.size < hash_size) {
647
+ free(galloc->hash_set.keys);
648
+ free(galloc->hash_values);
 
 
 
 
 
649
  galloc->hash_set.size = hash_size;
650
+ galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
651
+ galloc->hash_values = calloc(sizeof(struct hash_node), hash_size);
652
+ GGML_ASSERT(galloc->hash_set.keys != NULL);
653
+ GGML_ASSERT(galloc->hash_values != NULL);
654
+ } else {
655
+ // reset hash table
656
+ memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
657
+ memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
658
  }
659
 
660
+ // reset allocators
661
+ for (int i = 0; i < galloc->n_buffers; i++) {
662
+ ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
663
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
664
 
665
+ // allocate in hash table
666
+ ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids);
667
 
668
+ // set the node_allocs from the hash table
669
+ if (galloc->n_nodes < graph->n_nodes) {
670
+ free(galloc->node_allocs);
671
+ galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
672
+ GGML_ASSERT(galloc->node_allocs != NULL);
673
  }
674
+ galloc->n_nodes = graph->n_nodes;
675
+ for (int i = 0; i < graph->n_nodes; i++) {
676
+ struct ggml_tensor * node = graph->nodes[i];
677
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];
678
+ node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
679
+ if (node->view_src || node->data) {
680
+ node_alloc->dst.offset = SIZE_MAX;
681
+ node_alloc->dst.size_max = 0;
682
+ } else {
683
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
684
+ node_alloc->dst.offset = hn->offset;
685
+ node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
686
+ }
687
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
688
+ struct ggml_tensor * src = node->src[j];
689
+ if (!src || src->view_src || src->data) {
690
+ node_alloc->src[j].offset = SIZE_MAX;
691
+ node_alloc->src[j].size_max = 0;
692
+ } else {
693
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
694
+ node_alloc->src[j].offset = hn->offset;
695
+ node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
696
+ }
697
+ }
698
  }
 
699
 
700
+ // reallocate buffers if needed
701
+ for (int i = 0; i < galloc->n_buffers; i++) {
702
+ size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
703
+ size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
704
 
705
+ if (new_size > cur_size) {
706
+ #ifndef NDEBUG
707
+ fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
708
+ #endif
709
+ ggml_backend_buffer_free(galloc->buffers[i]);
710
+ galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
711
+ if (galloc->buffers[i] == NULL) {
712
+ fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
713
+ return false;
714
+ }
715
+ }
716
+ }
717
 
718
+ return true;
 
 
719
  }
720
 
721
+ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
722
+ return ggml_gallocr_reserve_n(galloc, graph, NULL);
 
 
 
 
 
 
 
 
 
 
 
 
723
  }
724
 
725
+ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
726
+ assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
 
727
 
728
+ if (node->view_src != NULL) {
729
+ if (node->buffer == NULL) {
730
+ assert(tensor_alloc->offset == SIZE_MAX);
731
+ if (node->view_src->buffer == NULL) {
732
+ // this tensor was allocated without ggml-backend
733
+ return;
734
+ }
735
+ ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
736
+ }
737
+ } else {
738
+ if (node->data == NULL) {
739
+ assert(tensor_alloc->offset != SIZE_MAX);
740
+ assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
741
+ void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
742
+ void * addr = (char *)base + tensor_alloc->offset;
743
+ ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
744
+ } else {
745
+ if (node->buffer == NULL) {
746
+ // this tensor was allocated without ggml-backend
747
+ return;
748
+ }
749
 
750
+ #ifndef NDEBUG
751
+ size_t offset =
752
+ (char *)node->data -
753
+ (char *)ggml_backend_buffer_get_base(node->buffer);
754
+ size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
755
+ assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
756
+ assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
757
+ #endif
758
+ }
759
+ }
760
  }
761
 
762
+ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
763
+ ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
764
+ size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
765
+ return talloc->size_max >= node_size;
766
  }
767
 
768
+ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
769
+ if (galloc->n_nodes != graph->n_nodes) {
770
+ #ifndef NDEBUG
771
+ fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
772
+ #endif
773
+ return true;
774
+ }
775
 
776
+ for (int i = 0; i < graph->n_nodes; i++) {
777
+ struct ggml_tensor * node = graph->nodes[i];
778
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];
779
 
780
+ if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
781
+ #ifndef NDEBUG
782
+ fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
783
+ #endif
784
+ return true;
785
+ }
786
 
787
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
788
+ struct ggml_tensor * src = node->src[j];
789
+ if (src == NULL) {
790
+ break;
791
+ }
792
+ if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
793
+ #ifndef NDEBUG
794
+ fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
795
+ #endif
796
+ return true;
797
+ }
798
+ }
799
  }
800
 
801
+ return false;
 
 
802
  }
803
 
804
+ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
805
+ if (ggml_gallocr_needs_realloc(galloc, graph)) {
806
+ if (galloc->n_buffers == 1) {
807
+ #ifndef NDEBUG
808
+ fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
809
+ #endif
810
+ if (!ggml_gallocr_reserve(galloc, graph)) {
811
+ return false;
812
+ }
813
+ } else {
814
+ #ifndef NDEBUG
815
+ fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
816
+ #endif
817
+ return false;
818
+ }
819
+ }
820
 
821
+ // reset buffers
822
+ for (int i = 0; i < galloc->n_buffers; i++) {
823
+ // zero size buffers are not allocated
824
+ if (galloc->buffers[i] != NULL) {
825
+ ggml_backend_buffer_reset(galloc->buffers[i]);
826
+ }
827
+ }
828
 
829
+ // allocate the graph tensors from the previous assignments
830
+ for (int i = 0; i < graph->n_nodes; i++) {
831
+ struct ggml_tensor * node = graph->nodes[i];
832
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];
833
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
834
+ struct ggml_tensor * src = node->src[j];
835
+ if (src == NULL) {
836
+ break;
837
+ }
838
+ ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
839
+ }
840
+ ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
841
+ }
842
 
843
+ return true;
 
844
  }
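The two branches above leave the caller with a single failure mode to handle: with one buffer the allocator re-reserves on its own, with multiple buffers it only reports the problem. A minimal caller-side sketch, assuming a hypothetical build_graph() helper and an application-provided node_buffer_ids array (neither is part of this commit):

    struct ggml_cgraph * graph = build_graph(user_ctx);            // placeholder graph builder
    if (!ggml_gallocr_alloc_graph(galloc, graph)) {
        // multi-buffer allocators do not re-reserve automatically, so do it explicitly
        if (!ggml_gallocr_reserve_n(galloc, graph, node_buffer_ids) ||
            !ggml_gallocr_alloc_graph(galloc, graph)) {
            fprintf(stderr, "compute graph allocation failed\n");
        }
    }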
845
 
846
+ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
847
+ GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
848
+
849
+ if (galloc->buffers[buffer_id] == NULL) {
850
+ return 0;
851
+ }
852
+ return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
853
  }
854
 
855
  // utils
 
870
  return false;
871
  }
872
 
873
+ struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer);
874
 
875
  for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
876
  if (t->data == NULL) {
877
  if (t->view_src == NULL) {
878
  ggml_tallocr_alloc(tallocr, t);
879
+ } else if (t->buffer == NULL) {
880
  ggml_backend_view_init(buffer, t);
881
  }
882
  } else {
883
+ if (t->view_src != NULL && t->buffer == NULL) {
884
  // view of a pre-allocated tensor
885
  ggml_backend_view_init(buffer, t);
886
  }
 
913
  }
914
 
915
  if (this_size > max_size) {
 
916
  fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
917
  __func__, t->name,
918
  ggml_backend_buft_name(buft),
 
944
  }
945
 
946
  if (n_buffers == 0) {
 
947
  #ifndef NDEBUG
948
  fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
949
  #endif
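The utility above is normally driven from a no_alloc context; a minimal sketch, assuming a single F32 tensor and the CPU buffer type (sizes and names are placeholders, not part of this commit):

    struct ggml_init_params params = {
        /* .mem_size   = */ ggml_tensor_overhead() * 8,   // metadata only, no tensor data
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true,                         // data is allocated by ggml-backend below
    };
    struct ggml_context * ctx = ggml_init(params);
    struct ggml_tensor  * w   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);

    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
    if (buf == NULL) {
        fprintf(stderr, "failed to allocate tensors\n");  // allocation can fail, as checked above
    }
    // w->data now points into buf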
ggml-alloc.h CHANGED
@@ -6,88 +6,62 @@
6
  extern "C" {
7
  #endif
8
 
9
- struct ggml_backend;
10
- struct ggml_backend_buffer;
11
- struct ggml_backend_buffer_type;
12
 
13
- //
14
- // Legacy API
15
- //
16
-
17
- typedef struct ggml_allocr * ggml_allocr_t;
18
-
19
- // initialize allocator for use with CPU backend only
20
- GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment);
21
- GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment);
22
-
23
- // initialize allocator for use with ggml-backend
24
- GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
25
- GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
26
- GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend);
27
-
28
- GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc);
29
-
30
- // tell the allocator to parse nodes following the order described in the list
31
- // you should call this if your graph are optimized to execute out-of-order
32
- GGML_API void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n);
33
-
34
- GGML_API void ggml_allocr_free (ggml_allocr_t alloc);
35
- GGML_API bool ggml_allocr_is_measure (ggml_allocr_t alloc);
36
- GGML_API void ggml_allocr_reset (ggml_allocr_t alloc);
37
- GGML_API void ggml_allocr_alloc (ggml_allocr_t alloc, struct ggml_tensor * tensor);
38
- GGML_API size_t ggml_allocr_max_size (ggml_allocr_t alloc);
39
-
40
- GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph);
41
 
42
- //
43
- // ggml-backend v2 API
44
- //
45
 
46
- // Separate tensor and graph allocator objects
47
- // This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
48
- // The original API is kept as a wrapper around the new API
 
49
 
50
- // Tensor allocator
51
- typedef struct ggml_tallocr * ggml_tallocr_t;
52
 
53
- GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
54
- GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
55
- GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size);
56
- GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
57
- GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
58
- GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
59
- GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
60
 
61
- GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
62
 
63
- GGML_API void ggml_tallocr_free (ggml_tallocr_t talloc);
64
- GGML_API bool ggml_tallocr_is_measure (ggml_tallocr_t talloc);
65
- GGML_API void ggml_tallocr_reset (ggml_tallocr_t talloc);
66
- GGML_API void ggml_tallocr_alloc (ggml_tallocr_t talloc, struct ggml_tensor * tensor);
67
- GGML_API size_t ggml_tallocr_max_size (ggml_tallocr_t talloc);
68
 
 
 
 
69
 
70
- // Graph allocator
71
  typedef struct ggml_gallocr * ggml_gallocr_t;
72
 
73
- GGML_API ggml_gallocr_t ggml_gallocr_new(void);
74
- GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 
75
 
76
- GGML_API void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n);
77
- GGML_API size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph);
 
 
 
 
78
 
79
- // Allocate tensors from the allocators given by the hash table
80
- GGML_API void ggml_gallocr_alloc_graph_n(
81
- ggml_gallocr_t galloc,
82
- struct ggml_cgraph * graph,
83
- struct ggml_hash_set hash_set,
84
- ggml_tallocr_t * hash_node_talloc);
85
 
 
86
 
87
  // Utils
88
  // Create a buffer and allocate all the tensors in a ggml_context
89
- GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
90
- GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);
91
 
92
  #ifdef __cplusplus
93
  }
 
6
  extern "C" {
7
  #endif
8
 
9
+ typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
10
+ typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
11
+ typedef struct ggml_backend * ggml_backend_t;
12
 
13
+ // Tensor allocator
14
+ typedef struct ggml_tallocr * ggml_tallocr_t;
 
15
 
16
+ GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer);
17
+ GGML_API void ggml_tallocr_free(ggml_tallocr_t talloc);
18
+ GGML_API void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
19
 
20
+ // Graph allocator
21
+ /*
22
+ Example usage:
23
+ ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
24
 
25
+ // optional: create a worst-case graph and reserve the buffers to avoid reallocations
26
+ ggml_gallocr_reserve(galloc, build_graph(max_batch));
27
 
28
+ // allocate the graph
29
+ struct ggml_cgraph * graph = build_graph(batch);
30
+ ggml_gallocr_alloc_graph(galloc, graph);
 
 
 
 
31
 
32
+ printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
33
 
34
+ // evaluate the graph
35
+ ggml_backend_graph_compute(backend, graph);
36
+ */
 
 
37
 
38
+ // special tensor flags for use with the graph allocator:
39
+ // ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
40
+ // ggml_set_output(): output tensors are never freed and never overwritten
41
 
 
42
  typedef struct ggml_gallocr * ggml_gallocr_t;
43
 
44
+ GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
45
+ GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
46
+ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
47
 
48
+ // pre-allocate buffers from a measure graph - does not allocate or modify the graph
49
+ // call with a worst-case graph to avoid buffer reallocations
50
+ // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
51
+ // returns false if the buffer allocation failed
52
+ GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
53
+ GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids);
54
 
55
+ // automatic reallocation if the topology changes when using a single buffer
56
+ // returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
57
+ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
 
 
 
58
 
59
+ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
60
 
61
  // Utils
62
  // Create a buffer and allocate all the tensors in a ggml_context
63
+ GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
64
+ GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
65
 
66
  #ifdef __cplusplus
67
  }
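Read end to end, the new header boils down to the flow below; build_graph(), max_batch, batch and backend are placeholders, and error handling is trimmed to the minimum, so treat this as a sketch rather than a complete program:

    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

    // optional: reserve once with a worst-case graph so later calls never grow the buffer
    if (!ggml_gallocr_reserve(galloc, build_graph(max_batch))) {
        fprintf(stderr, "reserve failed\n");
    }

    // tag inputs/outputs with ggml_set_input()/ggml_set_output() inside build_graph()
    // so they get the stable, non-reused addresses described above
    struct ggml_cgraph * graph = build_graph(batch);
    ggml_gallocr_alloc_graph(galloc, graph);

    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));

    ggml_backend_graph_compute(backend, graph);
    ggml_gallocr_free(galloc);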
ggml-backend.c CHANGED
@@ -475,6 +475,8 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
475
 
476
  // backend CPU
477
 
 
 
478
  GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
479
  return "CPU";
480
 
@@ -482,7 +484,14 @@ GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t
482
  }
483
 
484
  GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
485
- return (void *)buffer->context;
 
 
 
 
 
 
 
486
  }
487
 
488
  GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -540,8 +549,6 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
540
  /* .reset = */ NULL,
541
  };
542
 
543
- static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
544
-
545
  GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
546
  return "CPU";
547
 
@@ -550,9 +557,11 @@ GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend
550
 
551
  GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
552
  size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
553
- void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
554
-
555
- GGML_ASSERT(data != NULL && "failed to allocate buffer");
 
 
556
 
557
  return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
558
  }
@@ -766,6 +775,9 @@ static struct ggml_backend_i cpu_backend_i = {
766
 
767
  ggml_backend_t ggml_backend_cpu_init(void) {
768
  struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
 
 
 
769
 
770
  ctx->n_threads = GGML_DEFAULT_N_THREADS;
771
  ctx->work_data = NULL;
@@ -774,6 +786,10 @@ ggml_backend_t ggml_backend_cpu_init(void) {
774
  ctx->abort_callback_data = NULL;
775
 
776
  ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
 
 
 
 
777
 
778
  *cpu_backend = (struct ggml_backend) {
779
  /* .interface = */ cpu_backend_i,
@@ -865,6 +881,8 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_back
865
  ctx->n_buffers = n_buffers;
866
  ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
867
 
 
 
868
  size_t total_size = 0;
869
  for (size_t i = 0; i < n_buffers; i++) {
870
  ctx->buffers[i] = buffers[i];
@@ -886,6 +904,18 @@ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer,
886
  }
887
  }
888
 
 
 
 
 
 
 
 
 
 
 
 
 
889
 
890
  // scheduler
891
 
@@ -894,7 +924,7 @@ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer,
894
  #define GGML_MAX_SPLIT_INPUTS 16
895
 
896
  struct ggml_backend_sched_split {
897
- ggml_tallocr_t tallocr;
898
  int i_start;
899
  int i_end;
900
  struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
@@ -909,15 +939,17 @@ struct ggml_backend_sched {
909
  int n_backends;
910
  ggml_backend_t backends[GGML_MAX_BACKENDS];
911
  ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
912
- ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
913
 
914
  ggml_gallocr_t galloc;
915
 
916
  // hash keys of the nodes in the graph
917
  struct ggml_hash_set hash_set;
918
- // hash values (arrays of [hash_set.size])
919
- ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend)
920
- struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend
 
 
 
921
 
922
  // copy of the graph with modified inputs
923
  struct ggml_cgraph * graph;
@@ -927,77 +959,46 @@ struct ggml_backend_sched {
927
 
928
  struct ggml_context * ctx;
929
 
 
 
 
930
  // align context_buffer to GGML_MEM_ALIGN
931
  #ifdef _MSC_VER
932
  __declspec(align(GGML_MEM_ALIGN))
933
  #else
934
  __attribute__((aligned(GGML_MEM_ALIGN)))
935
  #endif
936
- char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
937
-
938
- ggml_backend_sched_eval_callback callback_eval;
939
- void * callback_eval_user_data;
940
  };
941
 
942
  #define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
943
- #define node_allocr(node) sched->node_talloc[hash_id(node)]
944
-
945
- static bool ggml_is_view_op(enum ggml_op op) {
946
- return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
947
- }
948
 
949
- // returns the priority of the backend, lower is better
950
- static int sched_backend_prio(ggml_backend_sched_t sched, ggml_backend_t backend) {
951
  for (int i = 0; i < sched->n_backends; i++) {
952
  if (sched->backends[i] == backend) {
953
  return i;
954
  }
955
  }
956
- return INT_MAX;
957
  }
958
 
959
- static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
960
- for (int i = 0; i < sched->n_backends; i++) {
961
- if (sched->tallocs[i] == allocr) {
962
- return i;
963
- }
964
- }
965
- return INT_MAX;
966
- }
967
-
968
- static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
969
  if (buffer == NULL) {
970
- return NULL;
971
- }
972
-
973
- // check if this is already allocate in a allocr buffer (from user manual allocations)
974
- for (int i = 0; i < sched->n_backends; i++) {
975
- if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) {
976
- return sched->tallocs[i];
977
- }
978
  }
979
 
980
  // find highest prio backend that supports the buffer type
981
  for (int i = 0; i < sched->n_backends; i++) {
982
  if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
983
- return sched->tallocs[i];
984
  }
985
  }
986
  GGML_ASSERT(false && "tensor buffer type not supported by any backend");
987
  }
988
 
989
- static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
990
- if (allocr == NULL) {
991
- return NULL;
992
- }
993
- for (int i = 0; i < sched->n_backends; i++) {
994
- if (sched->tallocs[i] == allocr) {
995
- return sched->backends[i];
996
- }
997
- }
998
- GGML_UNREACHABLE();
999
- }
1000
-
1001
  #if 0
1002
  static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
1003
  #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
@@ -1008,37 +1009,39 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_I
1008
  #endif
1009
 
1010
  // returns the backend that should be used for the node based on the current locations
1011
- static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
 
 
1012
  // assign pre-allocated nodes to their backend
1013
  // dst
1014
- ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer);
1015
- if (cur_allocr != NULL) {
1016
  SET_CAUSE(node, "1.dst");
1017
- return cur_allocr;
1018
  }
1019
  // view_src
1020
- if (node->view_src != NULL) {
1021
- cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer);
1022
- if (cur_allocr != NULL) {
1023
  SET_CAUSE(node, "1.vsrc");
1024
- return cur_allocr;
1025
  }
1026
  }
1027
  // assign nodes that use weights to the backend of the weights
1028
  for (int i = 0; i < GGML_MAX_SRC; i++) {
1029
- const struct ggml_tensor * src = node->src[i];
1030
  if (src == NULL) {
1031
  break;
1032
  }
1033
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1034
- ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer);
1035
  // operations with weights are always run on the same backend as the weights
1036
  SET_CAUSE(node, "1.wgt%d", i);
1037
- return src_allocr;
1038
  }
1039
  }
1040
 
1041
- return NULL;
1042
  }
1043
 
1044
  static char * fmt_size(size_t size) {
@@ -1051,11 +1054,11 @@ static char * fmt_size(size_t size) {
1051
  return buffer;
1052
  }
1053
 
1054
- static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1055
  int cur_split = 0;
1056
  for (int i = 0; i < graph->n_nodes; i++) {
1057
  if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
1058
- ggml_backend_t split_backend = get_allocr_backend(sched, sched->splits[cur_split].tallocr);
1059
  fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
1060
  sched->splits[cur_split].n_inputs);
1061
  for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
@@ -1069,17 +1072,15 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
1069
  if (ggml_is_view_op(node->op)) {
1070
  continue;
1071
  }
1072
- ggml_tallocr_t node_allocr = node_allocr(node);
1073
- ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
1074
  fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
1075
- fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
1076
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1077
  struct ggml_tensor * src = node->src[j];
1078
  if (src == NULL) {
1079
  break;
1080
  }
1081
- ggml_tallocr_t src_allocr = node_allocr(src);
1082
- ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
1083
  fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1084
  fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1085
  }
@@ -1087,23 +1088,13 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
1087
  }
1088
  }
1089
 
1090
- // creates a copy of the tensor with the same memory layout
1091
- static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
1092
- struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
1093
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
1094
- dup->nb[i] = tensor->nb[i];
1095
- }
1096
- return dup;
1097
- }
1098
-
1099
-
1100
  //#define DEBUG_PASS1
1101
  //#define DEBUG_PASS2
1102
  //#define DEBUG_PASS3
1103
  //#define DEBUG_PASS4
1104
 
1105
  // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
1106
- static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1107
  // reset splits
1108
  sched->n_splits = 0;
1109
  sched->is_reset = false;
@@ -1125,28 +1116,28 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1125
  // pass 1: assign backends to ops with pre-allocated inputs
1126
  for (int i = 0; i < graph->n_leafs; i++) {
1127
  struct ggml_tensor * leaf = graph->leafs[i];
1128
- if (node_allocr(leaf) != NULL) {
1129
  // do not overwrite user assignments
1130
  continue;
1131
  }
1132
- node_allocr(leaf) = sched_allocr_from_cur(sched, leaf);
1133
  }
1134
 
1135
  for (int i = 0; i < graph->n_nodes; i++) {
1136
  struct ggml_tensor * node = graph->nodes[i];
1137
- if (node_allocr(node) != NULL) {
1138
  // do not overwrite user assignments
1139
  continue;
1140
  }
1141
- node_allocr(node) = sched_allocr_from_cur(sched, node);
1142
  // src
1143
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1144
  struct ggml_tensor * src = node->src[j];
1145
  if (src == NULL) {
1146
  break;
1147
  }
1148
- if (node_allocr(src) == NULL) {
1149
- node_allocr(src) = sched_allocr_from_cur(sched, src);
1150
  }
1151
  }
1152
  }
@@ -1161,22 +1152,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1161
 
1162
  // pass 2.1 expand gpu up
1163
  {
1164
- ggml_tallocr_t cur_allocr = NULL;
1165
  for (int i = graph->n_nodes - 1; i >= 0; i--) {
1166
  struct ggml_tensor * node = graph->nodes[i];
1167
  if (ggml_is_view_op(node->op)) {
1168
  continue;
1169
  }
1170
- ggml_tallocr_t node_allocr = node_allocr(node);
1171
- if (node_allocr != NULL) {
1172
- if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
1173
  // skip cpu (lowest prio backend)
1174
- cur_allocr = NULL;
1175
  } else {
1176
- cur_allocr = node_allocr;
1177
  }
1178
  } else {
1179
- node_allocr(node) = cur_allocr;
1180
  SET_CAUSE(node, "2.1");
1181
  }
1182
  }
@@ -1184,22 +1175,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1184
 
1185
  // pass 2.2 expand gpu down
1186
  {
1187
- ggml_tallocr_t cur_allocr = NULL;
1188
  for (int i = 0; i < graph->n_nodes; i++) {
1189
  struct ggml_tensor * node = graph->nodes[i];
1190
  if (ggml_is_view_op(node->op)) {
1191
  continue;
1192
  }
1193
- ggml_tallocr_t node_allocr = node_allocr(node);
1194
- if (node_allocr != NULL) {
1195
- if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
1196
  // skip cpu (lowest prio backend)
1197
- cur_allocr = NULL;
1198
  } else {
1199
- cur_allocr = node_allocr;
1200
  }
1201
  } else {
1202
- node_allocr(node) = cur_allocr;
1203
  SET_CAUSE(node, "2.2");
1204
  }
1205
  }
@@ -1207,17 +1198,17 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1207
 
1208
  // pass 2.3 expand rest up
1209
  {
1210
- ggml_tallocr_t cur_allocr = NULL;
1211
  for (int i = graph->n_nodes - 1; i >= 0; i--) {
1212
  struct ggml_tensor * node = graph->nodes[i];
1213
  if (ggml_is_view_op(node->op)) {
1214
  continue;
1215
  }
1216
- ggml_tallocr_t node_allocr = node_allocr(node);
1217
- if (node_allocr != NULL) {
1218
- cur_allocr = node_allocr;
1219
  } else {
1220
- node_allocr(node) = cur_allocr;
1221
  SET_CAUSE(node, "2.3");
1222
  }
1223
  }
@@ -1225,17 +1216,17 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1225
 
1226
  // pass 2.4 expand rest down
1227
  {
1228
- ggml_tallocr_t cur_allocr = NULL;
1229
  for (int i = 0; i < graph->n_nodes; i++) {
1230
  struct ggml_tensor * node = graph->nodes[i];
1231
  if (ggml_is_view_op(node->op)) {
1232
  continue;
1233
  }
1234
- ggml_tallocr_t node_allocr = node_allocr(node);
1235
- if (node_allocr != NULL) {
1236
- cur_allocr = node_allocr;
1237
  } else {
1238
- node_allocr(node) = cur_allocr;
1239
  SET_CAUSE(node, "2.4");
1240
  }
1241
  }
@@ -1247,9 +1238,9 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1247
  // pass 3: assign backends to remaining src from dst and view_src
1248
  for (int i = 0; i < graph->n_nodes; i++) {
1249
  struct ggml_tensor * node = graph->nodes[i];
1250
- ggml_tallocr_t cur_allocr = node_allocr(node);
1251
- if (node->view_src != NULL && cur_allocr == NULL) {
1252
- cur_allocr = node_allocr(node) = node_allocr(node->view_src);
1253
  SET_CAUSE(node, "3.vsrc");
1254
  }
1255
  for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1257,14 +1248,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1257
  if (src == NULL) {
1258
  break;
1259
  }
1260
- ggml_tallocr_t src_allocr = node_allocr(src);
1261
- if (src_allocr == NULL) {
1262
  if (src->view_src != NULL) {
1263
  // views are always on the same backend as the source
1264
- node_allocr(src) = node_allocr(src->view_src);
1265
  SET_CAUSE(src, "3.vsrc");
1266
  } else {
1267
- node_allocr(src) = cur_allocr;
1268
  SET_CAUSE(src, "3.cur");
1269
  }
1270
  }
@@ -1281,15 +1272,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1281
  for (int i = 0; i < graph->n_nodes; i++) {
1282
  struct ggml_tensor * node = graph->nodes[i];
1283
  if (!ggml_is_view_op(node->op)) {
1284
- sched->splits[0].tallocr = node_allocr(node);
1285
  break;
1286
  }
1287
  }
1288
  sched->splits[0].i_start = 0;
1289
  sched->splits[0].n_inputs = 0;
1290
  memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
1291
- ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
1292
- size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
1293
  for (int i = 0; i < graph->n_nodes; i++) {
1294
  struct ggml_tensor * node = graph->nodes[i];
1295
 
@@ -1297,19 +1287,18 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1297
  continue;
1298
  }
1299
 
1300
- ggml_tallocr_t node_allocr = node_allocr(node);
1301
 
1302
- GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now
1303
 
1304
- if (node_allocr != cur_allocr) {
1305
  sched->splits[cur_split].i_end = i;
1306
  cur_split++;
1307
  GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
1308
- sched->splits[cur_split].tallocr = node_allocr;
1309
  sched->splits[cur_split].i_start = i;
1310
  sched->splits[cur_split].n_inputs = 0;
1311
- cur_allocr = node_allocr;
1312
- cur_backend_id = sched_allocr_prio(sched, cur_allocr);
1313
  }
1314
 
1315
  // find inputs that are not on the same backend
@@ -1318,43 +1307,25 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1318
  if (src == NULL) {
1319
  break;
1320
  }
1321
- ggml_tallocr_t src_allocr = node_allocr(src);
1322
- GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
1323
- if (src_allocr != node_allocr) {
1324
  // create a copy of the input in the split's backend
1325
  size_t id = hash_id(src);
1326
- if (sched->node_copies[id][cur_backend_id] == NULL) {
1327
- ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
1328
  struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1329
  ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
1330
 
1331
- sched->node_copies[id][cur_backend_id] = tensor_copy;
1332
- node_allocr(tensor_copy) = cur_allocr;
1333
  SET_CAUSE(tensor_copy, "4.cpy");
1334
 
1335
  int n_inputs = sched->splits[cur_split].n_inputs++;
1336
  GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
1337
  sched->splits[cur_split].inputs[n_inputs] = src;
1338
  }
1339
- node->src[j] = sched->node_copies[id][cur_backend_id];
1340
-
1341
- #if 0
1342
- // check if the input is already in the split
1343
- bool found = false;
1344
- for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
1345
- if (sched->splits[cur_split].inputs[k] == src) {
1346
- found = true;
1347
- break;
1348
- }
1349
- }
1350
-
1351
- if (!found) {
1352
- int n_inputs = sched->splits[cur_split].n_inputs++;
1353
- //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
1354
- GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
1355
- sched->splits[cur_split].inputs[n_inputs] = src;
1356
- }
1357
- #endif
1358
  }
1359
  }
1360
  }
@@ -1369,30 +1340,30 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1369
  // sanity check: all sources should have the same backend as the node
1370
  for (int i = 0; i < graph->n_nodes; i++) {
1371
  struct ggml_tensor * node = graph->nodes[i];
1372
- ggml_tallocr_t node_allocr = node_allocr(node);
1373
- if (node_allocr == NULL) {
1374
  fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
1375
  }
1376
- if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
1377
  fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
1378
- node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
1379
- node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
1380
  }
1381
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1382
  struct ggml_tensor * src = node->src[j];
1383
  if (src == NULL) {
1384
  break;
1385
  }
1386
- ggml_tallocr_t src_allocr = node_allocr(src);
1387
- if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now
1388
  fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
1389
- node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
1390
- j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
1391
  }
1392
- if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
1393
  fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
1394
- src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
1395
- src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
1396
  }
1397
  }
1398
  }
@@ -1406,32 +1377,43 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1406
  struct ggml_backend_sched_split * split = &sched->splits[i];
1407
  split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
1408
 
1409
- // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
1410
  for (int j = 0; j < split->n_inputs; j++) {
1411
  struct ggml_tensor * input = split->inputs[j];
1412
- struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
 
1413
  // add a dependency to the input source so that it is not freed before the copy is done
1414
- GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input);
1415
- input_cpy->src[0] = input;
 
 
 
 
1416
  graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1417
  }
1418
 
1419
  for (int j = split->i_start; j < split->i_end; j++) {
 
1420
  graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
1421
  }
1422
  }
1423
  sched->graph = graph_copy;
1424
  }
1425
 
1426
- static void sched_alloc_splits(ggml_backend_sched_t sched) {
1427
- ggml_gallocr_alloc_graph_n(
1428
- sched->galloc,
1429
- sched->graph,
1430
- sched->hash_set,
1431
- sched->node_talloc);
1432
  }
1433
 
1434
- static void sched_compute_splits(ggml_backend_sched_t sched) {
1435
  uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
1436
  uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
1437
 
@@ -1439,20 +1421,18 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
1439
 
1440
  for (int i = 0; i < sched->n_splits; i++) {
1441
  struct ggml_backend_sched_split * split = &splits[i];
1442
- ggml_backend_t split_backend = get_allocr_backend(sched, split->tallocr);
1443
- int split_backend_id = sched_backend_prio(sched, split_backend);
1444
 
1445
  // copy the input tensors to the split backend
1446
  uint64_t copy_start_us = ggml_time_us();
1447
  for (int j = 0; j < split->n_inputs; j++) {
1448
  struct ggml_tensor * input = split->inputs[j];
1449
- struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
1450
 
1451
  GGML_ASSERT(input->buffer != NULL);
1452
  GGML_ASSERT(input_cpy->buffer != NULL);
1453
 
1454
- // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
1455
- // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
1456
  ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
1457
  }
1458
  //ggml_backend_synchronize(split_backend); // necessary to measure copy time
@@ -1468,7 +1448,9 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
1468
 
1469
  uint64_t compute_start_us = ggml_time_us();
1470
  if (!sched->callback_eval) {
1471
- ggml_backend_graph_compute(split_backend, &split->graph);
 
 
1472
  //ggml_backend_synchronize(split_backend); // necessary to measure compute time
1473
  } else {
1474
  // similar to ggml_backend_compare_graph_backend
@@ -1488,7 +1470,9 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
1488
 
1489
  struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
1490
 
1491
- ggml_backend_graph_compute(split_backend, &gv);
 
 
1492
 
1493
  if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
1494
  break;
@@ -1510,19 +1494,8 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
1510
  }
1511
  }
1512
  #endif
1513
- }
1514
-
1515
- static void sched_reset(ggml_backend_sched_t sched) {
1516
- for (int i = 0; i < sched->n_backends; i++) {
1517
- ggml_tallocr_reset(sched->tallocs[i]);
1518
- }
1519
- // reset state for the next run
1520
- size_t hash_size = sched->hash_set.size;
1521
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
1522
- memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
1523
- memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
1524
 
1525
- sched->is_reset = true;
1526
  }
1527
 
1528
  ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
@@ -1532,9 +1505,10 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back
1532
  struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
1533
 
1534
  // initialize hash table
1535
- sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
1536
- sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
1537
- sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1);
 
1538
 
1539
  sched->n_backends = n_backends;
1540
  for (int i = 0; i < n_backends; i++) {
@@ -1542,14 +1516,9 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back
1542
  sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
1543
  }
1544
 
1545
- sched->galloc = ggml_gallocr_new();
1546
 
1547
- // init measure allocs for each backend
1548
- for (int i = 0; i < n_backends; i++) {
1549
- sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
1550
- }
1551
-
1552
- sched_reset(sched);
1553
 
1554
  return sched;
1555
  }
@@ -1558,49 +1527,54 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
1558
  if (sched == NULL) {
1559
  return;
1560
  }
1561
- for (int i = 0; i < sched->n_backends; i++) {
1562
- ggml_tallocr_free(sched->tallocs[i]);
1563
- }
1564
  ggml_gallocr_free(sched->galloc);
1565
  ggml_free(sched->ctx);
1566
  free(sched->hash_set.keys);
1567
- free(sched->node_talloc);
1568
- free(sched->node_copies);
 
1569
  free(sched);
1570
  }
1571
 
1572
- void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
1573
- GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once
 
 
 
 
1574
 
1575
- sched_split_graph(sched, measure_graph);
1576
- sched_alloc_splits(sched);
1577
 
1578
- // allocate buffers and reset allocators
1579
- for (int i = 0; i < sched->n_backends; i++) {
1580
- size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
1581
- ggml_tallocr_free(sched->tallocs[i]);
1582
- sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size);
1583
  }
1584
 
1585
- sched_reset(sched);
 
1586
  }
1587
 
1588
- void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1589
  GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
1590
 
1591
  if (!sched->is_reset) {
1592
- sched_reset(sched);
1593
  }
1594
 
1595
- sched_split_graph(sched, graph);
1596
- sched_alloc_splits(sched);
1597
- sched_compute_splits(sched);
1598
- }
1599
 
1600
- void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
1601
- sched_reset(sched);
1602
- }
1603
 
 
 
1604
 
1605
  void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
1606
  sched->callback_eval = callback;
@@ -1611,37 +1585,30 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
1611
  return sched->n_splits;
1612
  }
1613
 
1614
- ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
1615
- int backend_index = sched_backend_prio(sched, backend);
1616
- GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1617
- return sched->tallocs[backend_index];
1618
- }
1619
-
1620
- ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
1621
- int backend_index = sched_backend_prio(sched, backend);
1622
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1623
- return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
1624
  }
1625
 
1626
  void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
1627
- int backend_index = sched_backend_prio(sched, backend);
1628
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1629
- node_allocr(node) = sched->tallocs[backend_index];
1630
  }
1631
 
1632
  ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
1633
- ggml_tallocr_t allocr = node_allocr(node);
1634
- if (allocr == NULL) {
1635
  return NULL;
1636
  }
1637
- return get_allocr_backend(sched, allocr);
1638
  }
1639
 
1640
  // utils
1641
 
1642
  void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
1643
  GGML_ASSERT(tensor->buffer == NULL);
1644
- //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
1645
  GGML_ASSERT(tensor->view_src != NULL);
1646
  GGML_ASSERT(tensor->view_src->buffer != NULL);
1647
  GGML_ASSERT(tensor->view_src->data != NULL);
@@ -1665,7 +1632,7 @@ void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor
1665
  ggml_backend_buffer_init_tensor(buffer, tensor);
1666
  }
1667
 
1668
- static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
1669
  struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
1670
 
1671
  GGML_ASSERT(src != NULL);
@@ -1678,7 +1645,7 @@ static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, stru
1678
 
1679
  struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
1680
  if (src->view_src != NULL) {
1681
- dst->view_src = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
1682
  dst->view_offs = src->view_offs;
1683
  }
1684
  dst->op = src->op;
@@ -1691,14 +1658,14 @@ static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, stru
1691
  if (s == NULL) {
1692
  break;
1693
  }
1694
- dst->src[i] = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
1695
  }
1696
 
1697
  node_copies[id] = dst;
1698
  return dst;
1699
  }
1700
 
1701
- static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
1702
  size_t id = ggml_hash_find(hash_set, src);
1703
  if (node_init[id]) {
1704
  return;
@@ -1707,7 +1674,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
1707
 
1708
  struct ggml_tensor * dst = node_copies[id];
1709
  if (dst->view_src != NULL) {
1710
- graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
1711
  ggml_backend_view_init(dst->view_src->buffer, dst);
1712
  }
1713
  else {
@@ -1720,17 +1687,17 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
1720
  if (s == NULL) {
1721
  break;
1722
  }
1723
- graph_init_tensor(hash_set, node_copies, node_init, s);
1724
  }
1725
  }
1726
 
1727
  struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
1728
  struct ggml_hash_set hash_set = {
1729
  /* .size = */ graph->visited_hash_table.size,
1730
- /* .keys = */ calloc(sizeof(hash_set.keys[0]) * graph->visited_hash_table.size, 1)
1731
  };
1732
- struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]) * hash_set.size, 1);
1733
- bool * node_init = calloc(sizeof(node_init[0]) * hash_set.size, 1);
1734
 
1735
  struct ggml_init_params params = {
1736
  /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
@@ -1759,7 +1726,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
1759
  // dup nodes
1760
  for (int i = 0; i < graph->n_nodes; i++) {
1761
  struct ggml_tensor * node = graph->nodes[i];
1762
- graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
1763
  }
1764
 
1765
  // allocate nodes
@@ -1784,7 +1751,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
1784
  // copy data and init views
1785
  for (int i = 0; i < graph->n_nodes; i++) {
1786
  struct ggml_tensor * node = graph->nodes[i];
1787
- graph_init_tensor(hash_set, node_copies, node_init, node);
1788
  }
1789
 
1790
  // build graph copy
 
475
 
476
  // backend CPU
477
 
478
+ static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
479
+
480
  GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
481
  return "CPU";
482
 
 
484
  }
485
 
486
  GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
487
+ uintptr_t data = (uintptr_t)buffer->context;
488
+
489
+ // align the buffer
490
+ if (data % TENSOR_ALIGNMENT != 0) {
491
+ data = GGML_PAD(data, TENSOR_ALIGNMENT);
492
+ }
493
+
494
+ return (void *)data;
495
  }
496
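A quick worked example of the padding above, with illustrative numbers only: for TENSOR_ALIGNMENT = 64, a context pointer of 0x1001 is rounded up by GGML_PAD to the next multiple of 64, which is 0x1040; since ggml_backend_cpu_buffer_type_alloc_buffer adds TENSOR_ALIGNMENT extra bytes to the malloc size, the padded base plus the requested size still fits inside the allocation.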
 
497
  GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 
549
  /* .reset = */ NULL,
550
  };
551
 
 
 
552
  GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
553
  return "CPU";
554
 
 
557
 
558
  GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
559
  size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
560
+ void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
561
+ if (data == NULL) {
562
+ fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
563
+ return NULL;
564
+ }
565
 
566
  return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
567
  }
 
775
 
776
  ggml_backend_t ggml_backend_cpu_init(void) {
777
  struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
778
+ if (ctx == NULL) {
779
+ return NULL;
780
+ }
781
 
782
  ctx->n_threads = GGML_DEFAULT_N_THREADS;
783
  ctx->work_data = NULL;
 
786
  ctx->abort_callback_data = NULL;
787
 
788
  ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
789
+ if (cpu_backend == NULL) {
790
+ free(ctx);
791
+ return NULL;
792
+ }
793
 
794
  *cpu_backend = (struct ggml_backend) {
795
  /* .interface = */ cpu_backend_i,
 
881
  ctx->n_buffers = n_buffers;
882
  ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
883
 
884
+ GGML_ASSERT(ctx->buffers != NULL);
885
+
886
  size_t total_size = 0;
887
  for (size_t i = 0; i < n_buffers; i++) {
888
  ctx->buffers[i] = buffers[i];
 
904
  }
905
  }
906
 
907
+ // creates a copy of the tensor with the same memory layout
908
+ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
909
+ struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
910
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
911
+ dup->nb[i] = tensor->nb[i];
912
+ }
913
+ return dup;
914
+ }
915
+
916
+ static bool ggml_is_view_op(enum ggml_op op) {
917
+ return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
918
+ }
919
 
920
  // scheduler
921
 
 
924
  #define GGML_MAX_SPLIT_INPUTS 16
925
 
926
  struct ggml_backend_sched_split {
927
+ int backend_id;
928
  int i_start;
929
  int i_end;
930
  struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
 
939
  int n_backends;
940
  ggml_backend_t backends[GGML_MAX_BACKENDS];
941
  ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
 
942
 
943
  ggml_gallocr_t galloc;
944
 
945
  // hash keys of the nodes in the graph
946
  struct ggml_hash_set hash_set;
947
+ // hash values
948
+ int * tensor_backend_id;
949
+ struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS];
950
+
951
+ int * node_backend_ids; // [n_nodes]
952
+ int n_nodes;
953
 
954
  // copy of the graph with modified inputs
955
  struct ggml_cgraph * graph;
 
959
 
960
  struct ggml_context * ctx;
961
 
962
+ ggml_backend_sched_eval_callback callback_eval;
963
+ void * callback_eval_user_data;
964
+
965
  // align context_buffer to GGML_MEM_ALIGN
966
  #ifdef _MSC_VER
967
  __declspec(align(GGML_MEM_ALIGN))
968
  #else
969
  __attribute__((aligned(GGML_MEM_ALIGN)))
970
  #endif
971
+ char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
 
 
 
972
  };
973
 
974
  #define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
975
+ #define tensor_backend_id(node) sched->tensor_backend_id[hash_id(node)]
976
+ #define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)])
 
 
 
977
 
978
+ // returns the priority of the backend, lower id is higher priority
979
+ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
980
  for (int i = 0; i < sched->n_backends; i++) {
981
  if (sched->backends[i] == backend) {
982
  return i;
983
  }
984
  }
985
+ return -1;
986
  }
987
 
988
+ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
 
989
  if (buffer == NULL) {
990
+ return -1;
 
991
  }
992
 
993
  // find highest prio backend that supports the buffer type
994
  for (int i = 0; i < sched->n_backends; i++) {
995
  if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
996
+ return i;
997
  }
998
  }
999
  GGML_ASSERT(false && "tensor buffer type not supported by any backend");
1000
  }
1001
 
 
1002
  #if 0
1003
  static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
1004
  #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 
1009
  #endif
1010
 
1011
  // returns the backend that should be used for the node based on the current locations
1012
+ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
1013
+ // TODO: use supports_op to check if the backend supports the op
1014
+
1015
  // assign pre-allocated nodes to their backend
1016
  // dst
1017
+ int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->buffer);
1018
+ if (cur_backend != -1) {
1019
  SET_CAUSE(node, "1.dst");
1020
+ return cur_backend;
1021
  }
1022
  // view_src
1023
+ if (tensor->view_src != NULL) {
1024
+ cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src->buffer);
1025
+ if (cur_backend != -1) {
1026
  SET_CAUSE(node, "1.vsrc");
1027
+ return cur_backend;
1028
  }
1029
  }
1030
  // assign nodes that use weights to the backend of the weights
1031
  for (int i = 0; i < GGML_MAX_SRC; i++) {
1032
+ const struct ggml_tensor * src = tensor->src[i];
1033
  if (src == NULL) {
1034
  break;
1035
  }
1036
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1037
+ int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
1038
  // operations with weights are always run on the same backend as the weights
1039
  SET_CAUSE(node, "1.wgt%d", i);
1040
+ return src_backend;
1041
  }
1042
  }
1043
 
1044
+ return -1;
1045
  }
1046
 
1047
  static char * fmt_size(size_t size) {
 
1054
  return buffer;
1055
  }
1056
 
1057
+ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1058
  int cur_split = 0;
1059
  for (int i = 0; i < graph->n_nodes; i++) {
1060
  if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
1061
+ ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
1062
  fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
1063
  sched->splits[cur_split].n_inputs);
1064
  for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
 
1072
  if (ggml_is_view_op(node->op)) {
1073
  continue;
1074
  }
1075
+ ggml_backend_t tensor_backend = tensor_backend(node);
 
1076
  fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
1077
+ fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1078
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1079
  struct ggml_tensor * src = node->src[j];
1080
  if (src == NULL) {
1081
  break;
1082
  }
1083
+ ggml_backend_t src_backend = tensor_backend(src);
 
1084
  fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1085
  fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1086
  }
 
1088
  }
1089
  }
1090
 
 
 
 
 
 
 
 
 
 
 
1091
  //#define DEBUG_PASS1
1092
  //#define DEBUG_PASS2
1093
  //#define DEBUG_PASS3
1094
  //#define DEBUG_PASS4
1095
 
1096
  // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
1097
+ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1098
  // reset splits
1099
  sched->n_splits = 0;
1100
  sched->is_reset = false;
 
1116
  // pass 1: assign backends to ops with pre-allocated inputs
1117
  for (int i = 0; i < graph->n_leafs; i++) {
1118
  struct ggml_tensor * leaf = graph->leafs[i];
1119
+ if (tensor_backend_id(leaf) != -1) {
1120
  // do not overwrite user assignments
1121
  continue;
1122
  }
1123
+ tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf);
1124
  }
1125
 
1126
  for (int i = 0; i < graph->n_nodes; i++) {
1127
  struct ggml_tensor * node = graph->nodes[i];
1128
+ if (tensor_backend_id(node) != -1) {
1129
  // do not overwrite user assignments
1130
  continue;
1131
  }
1132
+ tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node);
1133
  // src
1134
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1135
  struct ggml_tensor * src = node->src[j];
1136
  if (src == NULL) {
1137
  break;
1138
  }
1139
+ if (tensor_backend_id(src) == -1) {
1140
+ tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
1141
  }
1142
  }
1143
  }
 
1152
 
1153
  // pass 2.1 expand gpu up
1154
  {
1155
+ int cur_backend_id = -1;
1156
  for (int i = graph->n_nodes - 1; i >= 0; i--) {
1157
  struct ggml_tensor * node = graph->nodes[i];
1158
  if (ggml_is_view_op(node->op)) {
1159
  continue;
1160
  }
1161
+ int tensor_backend_id = tensor_backend_id(node);
1162
+ if (tensor_backend_id != -1) {
1163
+ if (tensor_backend_id == sched->n_backends - 1) {
1164
  // skip cpu (lowest prio backend)
1165
+ cur_backend_id = -1;
1166
  } else {
1167
+ cur_backend_id = tensor_backend_id;
1168
  }
1169
  } else {
1170
+ tensor_backend_id(node) = cur_backend_id;
1171
  SET_CAUSE(node, "2.1");
1172
  }
1173
  }
 
1175
 
1176
  // pass 2.2 expand gpu down
1177
  {
1178
+ int cur_backend_id = -1;
1179
  for (int i = 0; i < graph->n_nodes; i++) {
1180
  struct ggml_tensor * node = graph->nodes[i];
1181
  if (ggml_is_view_op(node->op)) {
1182
  continue;
1183
  }
1184
+ int tensor_backend_id = tensor_backend_id(node);
1185
+ if (tensor_backend_id != -1) {
1186
+ if (tensor_backend_id == sched->n_backends - 1) {
1187
  // skip cpu (lowest prio backend)
1188
+ cur_backend_id = -1;
1189
  } else {
1190
+ cur_backend_id = tensor_backend_id;
1191
  }
1192
  } else {
1193
+ tensor_backend_id(node) = cur_backend_id;
1194
  SET_CAUSE(node, "2.2");
1195
  }
1196
  }
 
1198
 
1199
  // pass 2.3 expand rest up
1200
  {
1201
+ int cur_backend_id = -1;
1202
  for (int i = graph->n_nodes - 1; i >= 0; i--) {
1203
  struct ggml_tensor * node = graph->nodes[i];
1204
  if (ggml_is_view_op(node->op)) {
1205
  continue;
1206
  }
1207
+ int tensor_backend_id = tensor_backend_id(node);
1208
+ if (tensor_backend_id != -1) {
1209
+ cur_backend_id = tensor_backend_id;
1210
  } else {
1211
+ tensor_backend_id(node) = cur_backend_id;
1212
  SET_CAUSE(node, "2.3");
1213
  }
1214
  }
 
1216
 
1217
  // pass 2.4 expand rest down
1218
  {
1219
+ int cur_backend_id = -1;
1220
  for (int i = 0; i < graph->n_nodes; i++) {
1221
  struct ggml_tensor * node = graph->nodes[i];
1222
  if (ggml_is_view_op(node->op)) {
1223
  continue;
1224
  }
1225
+ int tensor_backend_id = tensor_backend_id(node);
1226
+ if (tensor_backend_id != -1) {
1227
+ cur_backend_id = tensor_backend_id;
1228
  } else {
1229
+ tensor_backend_id(node) = cur_backend_id;
1230
  SET_CAUSE(node, "2.4");
1231
  }
1232
  }
 
1238
  // pass 3: assign backends to remaining src from dst and view_src
1239
  for (int i = 0; i < graph->n_nodes; i++) {
1240
  struct ggml_tensor * node = graph->nodes[i];
1241
+ int cur_backend_id = tensor_backend_id(node);
1242
+ if (node->view_src != NULL && cur_backend_id == -1) {
1243
+ cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src);
1244
  SET_CAUSE(node, "3.vsrc");
1245
  }
1246
  for (int j = 0; j < GGML_MAX_SRC; j++) {
 
1248
  if (src == NULL) {
1249
  break;
1250
  }
1251
+ int src_backend_id = tensor_backend_id(src);
1252
+ if (src_backend_id == -1) {
1253
  if (src->view_src != NULL) {
1254
  // views are always on the same backend as the source
1255
+ tensor_backend_id(src) = tensor_backend_id(src->view_src);
1256
  SET_CAUSE(src, "3.vsrc");
1257
  } else {
1258
+ tensor_backend_id(src) = cur_backend_id;
1259
  SET_CAUSE(src, "3.cur");
1260
  }
1261
  }
 
1272
  for (int i = 0; i < graph->n_nodes; i++) {
1273
  struct ggml_tensor * node = graph->nodes[i];
1274
  if (!ggml_is_view_op(node->op)) {
1275
+ sched->splits[0].backend_id = tensor_backend_id(node);
1276
  break;
1277
  }
1278
  }
1279
  sched->splits[0].i_start = 0;
1280
  sched->splits[0].n_inputs = 0;
1281
  memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
1282
+ int cur_backend_id = sched->splits[0].backend_id;
 
1283
  for (int i = 0; i < graph->n_nodes; i++) {
1284
  struct ggml_tensor * node = graph->nodes[i];
1285
 
 
1287
  continue;
1288
  }
1289
 
1290
+ int tensor_backend_id = tensor_backend_id(node);
1291
 
1292
+ GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now
1293
 
1294
+ if (tensor_backend_id != cur_backend_id) {
1295
  sched->splits[cur_split].i_end = i;
1296
  cur_split++;
1297
  GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
1298
+ sched->splits[cur_split].backend_id = tensor_backend_id;
1299
  sched->splits[cur_split].i_start = i;
1300
  sched->splits[cur_split].n_inputs = 0;
1301
+ cur_backend_id = tensor_backend_id;
 
1302
  }
1303
 
1304
  // find inputs that are not on the same backend
 
1307
  if (src == NULL) {
1308
  break;
1309
  }
1310
+ int src_backend_id = tensor_backend_id(src);
1311
+ assert(src_backend_id != -1); // all inputs should be assigned by now
1312
+ if (src_backend_id != tensor_backend_id) {
1313
  // create a copy of the input in the split's backend
1314
  size_t id = hash_id(src);
1315
+ if (sched->tensor_copies[id][cur_backend_id] == NULL) {
1316
+ ggml_backend_t backend = sched->backends[cur_backend_id];
1317
  struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1318
  ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
1319
 
1320
+ sched->tensor_copies[id][cur_backend_id] = tensor_copy;
1321
+ tensor_backend_id(tensor_copy) = cur_backend_id;
1322
  SET_CAUSE(tensor_copy, "4.cpy");
1323
 
1324
  int n_inputs = sched->splits[cur_split].n_inputs++;
1325
  GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
1326
  sched->splits[cur_split].inputs[n_inputs] = src;
1327
  }
1328
+ node->src[j] = sched->tensor_copies[id][cur_backend_id];
 
1329
  }
1330
  }
1331
  }
 
1340
  // sanity check: all sources should have the same backend as the node
1341
  for (int i = 0; i < graph->n_nodes; i++) {
1342
  struct ggml_tensor * node = graph->nodes[i];
1343
+ ggml_backend_t tensor_backend = tensor_backend(node);
1344
+ if (tensor_backend == NULL) {
1345
  fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
1346
  }
1347
+ if (node->view_src != NULL && tensor_backend != tensor_backend(node->view_src)) {
1348
  fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
1349
+ node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
1350
+ node->view_src->name, tensor_backend(node->view_src) ? ggml_backend_name(tensor_backend(node->view_src)) : "NULL");
1351
  }
1352
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1353
  struct ggml_tensor * src = node->src[j];
1354
  if (src == NULL) {
1355
  break;
1356
  }
1357
+ ggml_backend_t src_backend = tensor_backend(src);
1358
+ if (src_backend != tensor_backend /* && src_backend != NULL */) {
1359
  fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
1360
+ node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
1361
+ j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
1362
  }
1363
+ if (src->view_src != NULL && src_backend != tensor_backend(src->view_src)) {
1364
  fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
1365
+ src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
1366
+ src->view_src->name, tensor_backend(src->view_src) ? ggml_backend_name(tensor_backend(src->view_src)) : "NULL");
1367
  }
1368
  }
1369
  }
 
1377
  struct ggml_backend_sched_split * split = &sched->splits[i];
1378
  split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
1379
 
 
1380
  for (int j = 0; j < split->n_inputs; j++) {
1381
  struct ggml_tensor * input = split->inputs[j];
1382
+ struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id];
1383
+
1384
  // add a dependency to the input source so that it is not freed before the copy is done
1385
+ struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
1386
+ sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
1387
+ graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
1388
+
1389
+ // add a dependency to the input copy so that it is allocated at the start of the split
1390
+ sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
1391
  graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1392
  }
1393
 
1394
  for (int j = split->i_start; j < split->i_end; j++) {
1395
+ sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
1396
  graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
1397
  }
1398
  }
1399
  sched->graph = graph_copy;
1400
  }
1401
 
1402
+ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
1403
+ // ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
1404
+ if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
1405
+ #ifndef NDEBUG
1406
+ fprintf(stderr, "ggml_backend_sched: failed to allocate graph, reserving\n");
1407
+ #endif
1408
+ ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
1409
+ if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
1410
+ fprintf(stderr, "ggml_backend_sched: failed to allocate graph\n");
1411
+ return false;
1412
+ }
1413
+ }
1414
  }
1415
 
1416
+ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
1417
  uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
1418
  uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
1419
 
 
1421
 
1422
  for (int i = 0; i < sched->n_splits; i++) {
1423
  struct ggml_backend_sched_split * split = &splits[i];
1424
+ int split_backend_id = split->backend_id;
1425
+ ggml_backend_t split_backend = sched->backends[split_backend_id];
1426
 
1427
  // copy the input tensors to the split backend
1428
  uint64_t copy_start_us = ggml_time_us();
1429
  for (int j = 0; j < split->n_inputs; j++) {
1430
  struct ggml_tensor * input = split->inputs[j];
1431
+ struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id];
1432
 
1433
  GGML_ASSERT(input->buffer != NULL);
1434
  GGML_ASSERT(input_cpy->buffer != NULL);
1435
 
 
 
1436
  ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
1437
  }
1438
  //ggml_backend_synchronize(split_backend); // necessary to measure copy time
 
1448
 
1449
  uint64_t compute_start_us = ggml_time_us();
1450
  if (!sched->callback_eval) {
1451
+ if (!ggml_backend_graph_compute(split_backend, &split->graph)) {
1452
+ return false;
1453
+ }
1454
  //ggml_backend_synchronize(split_backend); // necessary to measure compute time
1455
  } else {
1456
  // similar to ggml_backend_compare_graph_backend
 
1470
 
1471
  struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
1472
 
1473
+ if (!ggml_backend_graph_compute(split_backend, &gv)) {
1474
+ return false;
1475
+ }
1476
 
1477
  if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
1478
  break;
 
1494
  }
1495
  }
1496
  #endif
 
1497
 
1498
+ return true;
1499
  }
1500
 
1501
  ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
 
1505
  struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
1506
 
1507
  // initialize hash table
1508
+ sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
1509
+ sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
1510
+ sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
1511
+ sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
1512
 
1513
  sched->n_backends = n_backends;
1514
  for (int i = 0; i < n_backends; i++) {
 
1516
  sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
1517
  }
1518
 
1519
+ sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
1520
 
1521
+ ggml_backend_sched_reset(sched);
 
 
 
 
 
1522
 
1523
  return sched;
1524
  }
 
1527
  if (sched == NULL) {
1528
  return;
1529
  }
 
 
 
1530
  ggml_gallocr_free(sched->galloc);
1531
  ggml_free(sched->ctx);
1532
  free(sched->hash_set.keys);
1533
+ free(sched->tensor_backend_id);
1534
+ free(sched->tensor_copies);
1535
+ free(sched->node_backend_ids);
1536
  free(sched);
1537
  }
1538
 
1539
+ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
1540
+ // reset state for the next run
1541
+ size_t hash_size = sched->hash_set.size;
1542
+ memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
1543
+ memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
1544
+ memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
1545
 
1546
+ sched->is_reset = true;
1547
+ }
1548
 
1549
+ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
1550
+ ggml_backend_sched_split_graph(sched, measure_graph);
1551
+
1552
+ if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids)) {
1553
+ return false;
1554
  }
1555
 
1556
+ ggml_backend_sched_reset(sched);
1557
+ return true;
1558
  }
1559
 
1560
+ bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1561
  GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
1562
 
1563
  if (!sched->is_reset) {
1564
+ ggml_backend_sched_reset(sched);
1565
  }
1566
 
1567
+ ggml_backend_sched_split_graph(sched, graph);
1568
+ if (!ggml_backend_sched_alloc_splits(sched)) {
1569
+ return false;
1570
+ }
1571
 
1572
+ if (!ggml_backend_sched_compute_splits(sched)) {
1573
+ return false;
1574
+ }
1575
 
1576
+ return true;
1577
+ }
1578
 
1579
  void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
1580
  sched->callback_eval = callback;
 
1585
  return sched->n_splits;
1586
  }
1587
 
1588
+ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
1589
+ int backend_index = ggml_backend_sched_backend_id(sched, backend);
 
 
 
 
 
 
1590
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1591
+ return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
1592
  }
1593
 
1594
  void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
1595
+ int backend_index = ggml_backend_sched_backend_id(sched, backend);
1596
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1597
+ tensor_backend_id(node) = backend_index;
1598
  }
1599
 
1600
  ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
1601
+ int backend_index = tensor_backend_id(node);
1602
+ if (backend_index == -1) {
1603
  return NULL;
1604
  }
1605
+ return sched->backends[backend_index];
1606
  }
1607
 
1608
  // utils
1609
 
1610
  void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
1611
  GGML_ASSERT(tensor->buffer == NULL);
 
1612
  GGML_ASSERT(tensor->view_src != NULL);
1613
  GGML_ASSERT(tensor->view_src->buffer != NULL);
1614
  GGML_ASSERT(tensor->view_src->data != NULL);
 
1632
  ggml_backend_buffer_init_tensor(buffer, tensor);
1633
  }
1634
 
1635
+ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
1636
  struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
1637
 
1638
  GGML_ASSERT(src != NULL);
 
1645
 
1646
  struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
1647
  if (src->view_src != NULL) {
1648
+ dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
1649
  dst->view_offs = src->view_offs;
1650
  }
1651
  dst->op = src->op;
 
1658
  if (s == NULL) {
1659
  break;
1660
  }
1661
+ dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
1662
  }
1663
 
1664
  node_copies[id] = dst;
1665
  return dst;
1666
  }
1667
 
1668
+ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
1669
  size_t id = ggml_hash_find(hash_set, src);
1670
  if (node_init[id]) {
1671
  return;
 
1674
 
1675
  struct ggml_tensor * dst = node_copies[id];
1676
  if (dst->view_src != NULL) {
1677
+ graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
1678
  ggml_backend_view_init(dst->view_src->buffer, dst);
1679
  }
1680
  else {
 
1687
  if (s == NULL) {
1688
  break;
1689
  }
1690
+ graph_copy_init_tensor(hash_set, node_copies, node_init, s);
1691
  }
1692
  }
1693
 
1694
  struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
1695
  struct ggml_hash_set hash_set = {
1696
  /* .size = */ graph->visited_hash_table.size,
1697
+ /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
1698
  };
1699
+ struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
1700
+ bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
1701
 
1702
  struct ggml_init_params params = {
1703
  /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
 
1726
  // dup nodes
1727
  for (int i = 0; i < graph->n_nodes; i++) {
1728
  struct ggml_tensor * node = graph->nodes[i];
1729
+ graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
1730
  }
1731
 
1732
  // allocate nodes
 
1751
  // copy data and init views
1752
  for (int i = 0; i < graph->n_nodes; i++) {
1753
  struct ggml_tensor * node = graph->nodes[i];
1754
+ graph_copy_init_tensor(hash_set, node_copies, node_init, node);
1755
  }
1756
 
1757
  // build graph copy
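
The net effect of these ggml-backend.c changes is that the scheduler now drives ggml-alloc v3 with a reserve/allocate cycle: ggml_backend_sched_alloc_splits first attempts a plain allocation of the split graph and only re-reserves the backend buffers when that fails. A minimal sketch of that retry pattern, assuming only the ggml_gallocr_* signatures used above (the arguments mirror the galloc, graph and node_backend_ids fields of ggml_backend_sched):

    // sketch: allocate a scheduled graph, re-reserving the buffers on failure
    // (same control flow as ggml_backend_sched_alloc_splits in this commit)
    static bool alloc_graph_with_fallback(ggml_gallocr_t galloc, struct ggml_cgraph * graph, int * node_backend_ids) {
        if (ggml_gallocr_alloc_graph(galloc, graph)) {
            return true; // fast path: previously reserved buffers are still large enough
        }
        // the graph grew or changed topology - recompute the buffer sizes and retry once
        if (!ggml_gallocr_reserve_n(galloc, graph, node_backend_ids)) {
            return false;
        }
        return ggml_gallocr_alloc_graph(galloc, graph);
    }
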
ggml-backend.h CHANGED
@@ -130,11 +130,7 @@ extern "C" {
130
 
131
  // in build_graph:
132
  build_graph(...) {
133
- // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
134
- alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
135
- ggml_allocr_alloc(alloc_cpu, tensor);
136
-
137
- // manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
138
  struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
139
  ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
140
  }
@@ -164,20 +160,19 @@ extern "C" {
164
  GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
165
  GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
166
  // Initialize backend buffers from a measure graph
167
- GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
168
  // Get the number of splits of the last graph
169
  GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
170
 
171
- GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
172
- GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
173
 
174
  GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
175
  GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
176
 
177
  // Allocate and compute graph on the backend scheduler
178
- GGML_API void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
179
 
180
- // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
181
  GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
182
 
183
  // Set a callback to be called for each resulting node during graph compute
 
130
 
131
  // in build_graph:
132
  build_graph(...) {
133
+ // manually assign nodes to a backend (optional, should not be needed in most cases)
 
 
 
 
134
  struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
135
  ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
136
  }
 
160
  GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
161
  GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
162
  // Initialize backend buffers from a measure graph
163
+ GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
164
  // Get the number of splits of the last graph
165
  GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
166
 
167
+ GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
 
168
 
169
  GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
170
  GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
171
 
172
  // Allocate and compute graph on the backend scheduler
173
+ GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
174
 
175
+ // Reset all assignments and allocators - must be called before changing the node backends
176
  GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
177
 
178
  // Set a callback to be called for each resulting node during graph compute
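
In short, ggml-backend.h swaps the old measure/allocr entry points for a bool-returning reserve/compute pair and replaces the per-backend tallocr/buffer getters with a buffer-size query. A hedged usage sketch built only from the declarations above; the backends, the measure graph and the per-iteration graph are assumed to be provided by the caller:

    // requires ggml-backend.h and <stdio.h>; error handling abridged
    static bool run_with_sched(ggml_backend_t backend_gpu, ggml_backend_t backend_cpu,
                               struct ggml_cgraph * measure_graph, struct ggml_cgraph * graph, size_t graph_size) {
        ggml_backend_t backends[2] = { backend_gpu, backend_cpu };
        ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, graph_size);

        // reserve worst-case compute buffers once, from a measure graph
        if (!ggml_backend_sched_reserve(sched, measure_graph)) {
            ggml_backend_sched_free(sched);
            return false;
        }
        fprintf(stderr, "compute buffer (gpu) = %.2f MB\n",
                ggml_backend_sched_get_buffer_size(sched, backend_gpu) / 1e6);

        // per iteration: split, allocate and compute - all of which can now fail
        const bool ok = ggml_backend_sched_graph_compute(sched, graph);

        ggml_backend_sched_free(sched);
        return ok;
    }
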
ggml.c CHANGED
@@ -2607,7 +2607,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
2607
  /*.nb =*/ { 0, 0, 0, 0 },
2608
  /*.op =*/ GGML_OP_NONE,
2609
  /*.op_params =*/ { 0 },
2610
- /*.is_param =*/ false,
2611
  /*.grad =*/ NULL,
2612
  /*.src =*/ { NULL },
2613
  /*.perf_runs =*/ 0,
@@ -6509,7 +6509,7 @@ struct ggml_tensor * ggml_cross_entropy_loss_back(
6509
  void ggml_set_param(
6510
  struct ggml_context * ctx,
6511
  struct ggml_tensor * tensor) {
6512
- tensor->is_param = true;
6513
 
6514
  GGML_ASSERT(tensor->grad == NULL);
6515
  tensor->grad = ggml_dup_tensor(ctx, tensor);
@@ -15311,7 +15311,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
15311
  return NULL;
15312
  }
15313
 
15314
- if (node->is_param) {
15315
  return node;
15316
  }
15317
 
@@ -15345,7 +15345,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
15345
 
15346
  clone->op = node->op;
15347
  clone->grad = node->grad;
15348
- clone->is_param = node->is_param;
15349
  clone->extra = node->extra;
15350
  for (int k = 0; k < GGML_MAX_DIMS; ++k) {
15351
  clone->nb[k] = node->nb[k];
@@ -16377,7 +16377,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
16377
  for (int i = 0; i < gf->n_nodes; i++) {
16378
  struct ggml_tensor * node = gf->nodes[i];
16379
 
16380
- if (node->is_param) {
16381
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
16382
  ggml_build_forward_expand(gb, node->grad);
16383
  }
@@ -17862,7 +17862,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
17862
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
17863
  i,
17864
  node->ne[0], node->ne[1], node->ne[2],
17865
- ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
17866
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
17867
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
17868
  (double) node->perf_time_us / 1000.0,
@@ -17955,7 +17955,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17955
  continue;
17956
  }
17957
 
17958
- if (node->is_param) {
17959
  snprintf(color, sizeof(color), "yellow");
17960
  } else if (node->grad) {
17961
  if (ggml_graph_find(gf, node)) {
@@ -18129,7 +18129,7 @@ static enum ggml_opt_result ggml_opt_adam(
18129
  int np = 0;
18130
  int64_t nx = 0;
18131
  for (int i = 0; i < gf->n_nodes; ++i) {
18132
- if (gf->nodes[i]->is_param) {
18133
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
18134
 
18135
  GGML_ASSERT(np < GGML_MAX_PARAMS);
@@ -18492,7 +18492,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18492
  int np = 0;
18493
  int nx = 0;
18494
  for (int i = 0; i < gf->n_nodes; ++i) {
18495
- if (gf->nodes[i]->is_param) {
18496
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
18497
 
18498
  GGML_ASSERT(np < GGML_MAX_PARAMS);
@@ -18967,6 +18967,16 @@ enum ggml_opt_result ggml_opt_resume_g(
18967
 
18968
  ////////////////////////////////////////////////////////////////////////////////
18969
 
 
 
 
 
 
 
 
 
 
 
18970
  void ggml_quantize_init(enum ggml_type type) {
18971
  ggml_critical_section_start();
18972
 
 
2607
  /*.nb =*/ { 0, 0, 0, 0 },
2608
  /*.op =*/ GGML_OP_NONE,
2609
  /*.op_params =*/ { 0 },
2610
+ /*.flags =*/ 0,
2611
  /*.grad =*/ NULL,
2612
  /*.src =*/ { NULL },
2613
  /*.perf_runs =*/ 0,
 
6509
  void ggml_set_param(
6510
  struct ggml_context * ctx,
6511
  struct ggml_tensor * tensor) {
6512
+ tensor->flags |= GGML_TENSOR_FLAG_PARAM;
6513
 
6514
  GGML_ASSERT(tensor->grad == NULL);
6515
  tensor->grad = ggml_dup_tensor(ctx, tensor);
 
15311
  return NULL;
15312
  }
15313
 
15314
+ if (node->flags & GGML_TENSOR_FLAG_PARAM) {
15315
  return node;
15316
  }
15317
 
 
15345
 
15346
  clone->op = node->op;
15347
  clone->grad = node->grad;
15348
+ clone->flags = node->flags;
15349
  clone->extra = node->extra;
15350
  for (int k = 0; k < GGML_MAX_DIMS; ++k) {
15351
  clone->nb[k] = node->nb[k];
 
16377
  for (int i = 0; i < gf->n_nodes; i++) {
16378
  struct ggml_tensor * node = gf->nodes[i];
16379
 
16380
+ if (node->flags & GGML_TENSOR_FLAG_PARAM) {
16381
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
16382
  ggml_build_forward_expand(gb, node->grad);
16383
  }
 
17862
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
17863
  i,
17864
  node->ne[0], node->ne[1], node->ne[2],
17865
+ ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ", node->perf_runs,
17866
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
17867
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
17868
  (double) node->perf_time_us / 1000.0,
 
17955
  continue;
17956
  }
17957
 
17958
+ if (node->flags & GGML_TENSOR_FLAG_PARAM) {
17959
  snprintf(color, sizeof(color), "yellow");
17960
  } else if (node->grad) {
17961
  if (ggml_graph_find(gf, node)) {
 
18129
  int np = 0;
18130
  int64_t nx = 0;
18131
  for (int i = 0; i < gf->n_nodes; ++i) {
18132
+ if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
18133
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
18134
 
18135
  GGML_ASSERT(np < GGML_MAX_PARAMS);
 
18492
  int np = 0;
18493
  int nx = 0;
18494
  for (int i = 0; i < gf->n_nodes; ++i) {
18495
+ if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
18496
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
18497
 
18498
  GGML_ASSERT(np < GGML_MAX_PARAMS);
 
18967
 
18968
  ////////////////////////////////////////////////////////////////////////////////
18969
 
18970
+ void ggml_set_input(struct ggml_tensor * tensor) {
18971
+ tensor->flags |= GGML_TENSOR_FLAG_INPUT;
18972
+ }
18973
+
18974
+ void ggml_set_output(struct ggml_tensor * tensor) {
18975
+ tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
18976
+ }
18977
+
18978
+ ////////////////////////////////////////////////////////////////////////////////
18979
+
18980
  void ggml_quantize_init(enum ggml_type type) {
18981
  ggml_critical_section_start();
18982
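
The ggml.c side of the change is mechanical: ggml_set_param now sets a bit in the new flags field and every former is_param check becomes a bit test, which is what lets the input/output markers share the same storage. A small usage sketch, not code from this commit; ctx, n_in and n_out are placeholders:

    // mark a tensor as a trainable parameter and detect it later through the flags bitfield
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_in, n_out);
    ggml_set_param(ctx, w); // sets GGML_TENSOR_FLAG_PARAM and allocates w->grad

    if (w->flags & GGML_TENSOR_FLAG_PARAM) {
        // the optimizers above (adam, lbfgs) now recognize w as a parameter this way
    }
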
 
ggml.h CHANGED
@@ -505,11 +505,17 @@ extern "C" {
505
 
506
  enum ggml_log_level {
507
  GGML_LOG_LEVEL_ERROR = 2,
508
- GGML_LOG_LEVEL_WARN = 3,
509
- GGML_LOG_LEVEL_INFO = 4,
510
  GGML_LOG_LEVEL_DEBUG = 5
511
  };
512
 
 
 
 
 
 
 
513
  // ggml object
514
  struct ggml_object {
515
  size_t offs;
@@ -543,7 +549,7 @@ extern "C" {
543
  // op params - allocated as int32_t for alignment
544
  int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
545
 
546
- bool is_param;
547
 
548
  struct ggml_tensor * grad;
549
  struct ggml_tensor * src[GGML_MAX_SRC];
@@ -2092,6 +2098,12 @@ extern "C" {
2092
  ggml_opt_callback callback,
2093
  void * callback_data);
2094
 
 
 
 
 
 
 
2095
  //
2096
  // quantization
2097
  //
 
505
 
506
  enum ggml_log_level {
507
  GGML_LOG_LEVEL_ERROR = 2,
508
+ GGML_LOG_LEVEL_WARN = 3,
509
+ GGML_LOG_LEVEL_INFO = 4,
510
  GGML_LOG_LEVEL_DEBUG = 5
511
  };
512
 
513
+ enum ggml_tensor_flag {
514
+ GGML_TENSOR_FLAG_INPUT = 1,
515
+ GGML_TENSOR_FLAG_OUTPUT = 2,
516
+ GGML_TENSOR_FLAG_PARAM = 4,
517
+ };
518
+
519
  // ggml object
520
  struct ggml_object {
521
  size_t offs;
 
549
  // op params - allocated as int32_t for alignment
550
  int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
551
 
552
+ int32_t flags;
553
 
554
  struct ggml_tensor * grad;
555
  struct ggml_tensor * src[GGML_MAX_SRC];
 
2098
  ggml_opt_callback callback,
2099
  void * callback_data);
2100
 
2101
+ //
2102
+ // tensor flags
2103
+ //
2104
+ GGML_API void ggml_set_input(struct ggml_tensor * tensor);
2105
+ GGML_API void ggml_set_output(struct ggml_tensor * tensor);
2106
+
2107
  //
2108
  // quantization
2109
  //
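
The whisper.cpp changes below lean on these new flags to replace the old allocr-driven input handling: graph inputs are named and tagged with ggml_set_input() at build time, then looked up by name and filled only after the graph has been allocated. A condensed sketch of that pattern, assuming a context ctx0, a graph gf, shape placeholders n_ctx/n_mels and a caller-provided host_data buffer:

    // at graph build time: name the input and tag it for the allocator
    struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
    ggml_set_name (mel, "mel");
    ggml_set_input(mel);

    // after ggml_gallocr_alloc_graph(alloc, gf): find the input again and upload the data
    struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "mel");
    ggml_backend_tensor_set(inp, host_data, 0, ggml_nbytes(inp));
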
whisper.cpp CHANGED
@@ -471,52 +471,32 @@ struct whisper_pair {
471
 
472
  // ggml_allocr wrapper for whisper usage
473
  struct whisper_allocr {
474
- ggml_allocr * alloc = nullptr;
475
 
476
  std::vector<uint8_t> meta;
477
-
478
- ggml_backend_buffer_t buffer;
479
  };
480
 
481
  static size_t whisper_allocr_size(struct whisper_allocr & allocr) {
482
- return allocr.meta.size() + ggml_allocr_max_size(allocr.alloc);
483
  }
484
 
485
  // measure the memory usage of a graph and prepare the allocr's internal data buffer
486
- static void whisper_allocr_graph_init(struct whisper_allocr & allocr, ggml_backend_t backend, std::function<struct ggml_cgraph *()> && get_graph) {
487
  auto & alloc = allocr.alloc;
488
  auto & meta = allocr.meta;
489
 
490
- alloc = ggml_allocr_new_measure_from_backend(backend);
491
 
492
  meta.resize(ggml_tensor_overhead()*WHISPER_MAX_NODES + ggml_graph_overhead());
493
 
494
- ggml_allocr_alloc_graph(alloc, get_graph());
495
- }
496
-
497
- static void whisper_allocr_graph_realloc(struct whisper_allocr & allocr, ggml_backend_t backend) {
498
- if (allocr.alloc == nullptr) {
499
- // this can be null if we use external encoder like CoreML or OpenVINO
500
- return;
501
- }
502
-
503
- auto & alloc = allocr.alloc;
504
- auto & buffer = allocr.buffer;
505
-
506
- size_t size = ggml_allocr_max_size(alloc);
507
-
508
- ggml_allocr_free(alloc);
509
-
510
- buffer = ggml_backend_alloc_buffer(backend, size);
511
- alloc = ggml_allocr_new_from_buffer(buffer);
512
- }
513
-
514
- static void whisper_allocr_free(struct whisper_allocr & allocr) {
515
- if (allocr.alloc) {
516
- ggml_allocr_free(allocr.alloc);
517
- ggml_backend_buffer_free(allocr.buffer);
518
- allocr.alloc = nullptr;
519
  }
 
520
  }
521
 
522
  // medium
@@ -658,9 +638,9 @@ struct whisper_kv_cache {
658
  struct ggml_tensor * k;
659
  struct ggml_tensor * v;
660
 
661
- struct ggml_context * ctx;
662
 
663
- ggml_backend_buffer_t buffer;
664
  };
665
 
666
  struct whisper_model {
@@ -698,10 +678,10 @@ struct whisper_model {
698
  std::vector<whisper_layer_decoder> layers_decoder;
699
 
700
  // ggml context that contains all the meta information about the model tensors
701
- struct ggml_context * ctx;
702
 
703
  // the model backend data is read-only and can be shared between processors
704
- std::vector<struct ggml_backend_buffer *> buffers;
705
 
706
  // tensors
707
  int n_loaded;
@@ -903,36 +883,26 @@ static bool kv_cache_init(
903
  cache.ctx = ggml_init(params);
904
 
905
  if (!cache.ctx) {
906
- WHISPER_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
907
  return false;
908
  }
909
 
910
  cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
911
  cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
912
 
913
- const size_t mem_bytes = ggml_nbytes(cache.k) + ggml_nbytes(cache.v);
914
-
915
- cache.buffer = ggml_backend_alloc_buffer(backend, mem_bytes);
916
-
917
- // allocate the tensors into the backend buffer
918
- {
919
- ggml_allocr * alloc = ggml_allocr_new_from_buffer(cache.buffer);
920
-
921
- ggml_allocr_alloc(alloc, cache.k);
922
- ggml_allocr_alloc(alloc, cache.v);
923
-
924
- ggml_allocr_free(alloc);
925
  }
926
 
927
  return true;
928
  }
929
 
930
  static void kv_cache_free(struct whisper_kv_cache & cache) {
931
- if (cache.ctx) {
932
- ggml_free(cache.ctx);
933
- ggml_backend_buffer_free(cache.buffer);
934
- cache.ctx = nullptr;
935
- }
936
  }
937
 
938
  static bool whisper_kv_cache_find_slot(
@@ -1513,68 +1483,21 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1513
  }
1514
 
1515
  wctx.backend = whisper_backend_init(wctx.params);
1516
-
1517
- // some devices have a limit on the maximum size of single memory buffer
1518
- // for example, iPhones are limited to 1GB per buffer
1519
- // to workaround this, we will allocate multiple buffers of smaller size and will split the tensors with the
1520
- // model weights between them
1521
- //
1522
- // the map_t2b maps tensor names to buffer indices
1523
- // as we iterate over the tensors, we will allocate new buffers when the current one is full
1524
- //
1525
- // finally, we create a separate allocator for each buffer and use it to allocate the tensors
1526
- // we keep the allocators alive until all the tensors are loaded
1527
-
1528
- GGML_ASSERT(model.buffers.empty());
1529
-
1530
- std::map<std::string, int> map_t2b;
1531
-
1532
- {
1533
- size_t size_main = 0;
1534
- size_t size_cur = 0;
1535
-
1536
- static const size_t GB = 1024ull*1024ull*1024ull;
1537
-
1538
- for (const auto & t : model.tensors) {
1539
- const size_t cur = ggml_nbytes(t.second) + ggml_tensor_overhead();
1540
-
1541
- // adding the tensor to the current buffer will exceed the limit, so we need to allocate a new buffer
1542
- if (size_cur + cur > GB) {
1543
- GGML_ASSERT(size_cur > 0 && "A tensor is too large to fit in a single buffer");
1544
-
1545
- model.buffers.emplace_back(ggml_backend_alloc_buffer(wctx.backend, size_cur));
1546
-
1547
- size_cur = cur;
1548
- }
1549
-
1550
- map_t2b[t.first] = model.buffers.size();
1551
-
1552
- size_cur += cur;
1553
- size_main += cur;
1554
- }
1555
-
1556
- // allocate the last buffer if needed
1557
- if (size_cur > 0) {
1558
- model.buffers.emplace_back(ggml_backend_alloc_buffer(wctx.backend, size_cur));
1559
- }
1560
-
1561
- GGML_ASSERT(model.buffers.size() > 0);
1562
-
1563
- WHISPER_LOG_INFO("%s: %8s total size = %8.2f MB (%d buffers)\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6, (int) model.buffers.size());
1564
- }
1565
-
1566
- std::vector<ggml_allocr *> allocs(model.buffers.size());
1567
- for (size_t i = 0; i < allocs.size(); ++i) {
1568
- allocs[i] = ggml_allocr_new_from_buffer(model.buffers[i]);
1569
  }
1570
 
1571
  // allocate tensors in the backend buffers
1572
- {
1573
- for (const auto & t : model.tensors) {
1574
- ggml_allocr_alloc(allocs[map_t2b[t.first]], t.second);
1575
- }
1576
  }
1577
 
 
 
 
1578
  // load weights
1579
  {
1580
  size_t total_size = 0;
@@ -1636,15 +1559,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1636
  return false;
1637
  }
1638
 
1639
- ggml_backend_t backend = wctx.backend;
1640
 
1641
  //printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str());
1642
 
1643
- if ((ggml_backend_is_cpu(backend)
1644
- #ifdef GGML_USE_METAL
1645
- || ggml_backend_is_metal(backend)
1646
- #endif
1647
- )) {
1648
  // for the CPU and Metal backend, we can read directly into the tensor
1649
  loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
1650
  BYTESWAP_TENSOR(tensor);
@@ -1672,10 +1591,6 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1672
  }
1673
  }
1674
 
1675
- for (auto & alloc : allocs) {
1676
- ggml_allocr_free(alloc);
1677
- }
1678
-
1679
  wctx.t_load_us = ggml_time_us() - t_start_us;
1680
 
1681
  return true;
@@ -1704,7 +1619,6 @@ static struct ggml_cgraph * whisper_build_graph_conv(
1704
  whisper_state & wstate,
1705
  const int mel_offset) {
1706
  const auto & model = wctx.model;
1707
- const auto & mel_inp = wstate.mel;
1708
  const auto & hparams = model.hparams;
1709
 
1710
  const int n_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
@@ -1722,31 +1636,9 @@ static struct ggml_cgraph * whisper_build_graph_conv(
1722
 
1723
  ggml_cgraph * gf = ggml_new_graph(ctx0);
1724
 
1725
- ggml_allocr * alloc = wstate.alloc_conv.alloc;
1726
-
1727
  struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
1728
- ggml_allocr_alloc(alloc, mel);
1729
-
1730
- assert(mel->type == GGML_TYPE_F32);
1731
- if (!ggml_allocr_is_measure(alloc)) {
1732
- assert(mel_inp.n_mel == n_mels);
1733
-
1734
- wstate.inp_mel.resize(ggml_nelements(mel));
1735
-
1736
- float * dst = wstate.inp_mel.data();
1737
- memset(dst, 0, ggml_nbytes(mel));
1738
-
1739
- const int i0 = std::min(mel_offset, mel_inp.n_len);
1740
- const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);
1741
-
1742
- for (int j = 0; j < mel_inp.n_mel; ++j) {
1743
- for (int i = i0; i < i1; ++i) {
1744
- dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i];
1745
- }
1746
- }
1747
-
1748
- ggml_backend_tensor_set(mel, wstate.inp_mel.data(), 0, ggml_nelements(mel)*sizeof(float));
1749
- }
1750
 
1751
  struct ggml_tensor * cur = nullptr;
1752
 
@@ -2138,11 +2030,39 @@ static bool whisper_encode_internal(
2138
  {
2139
  auto & alloc = wstate.alloc_conv.alloc;
2140
 
2141
- ggml_allocr_reset(alloc);
2142
-
2143
  ggml_cgraph * gf = whisper_build_graph_conv(wctx, wstate, mel_offset);
2144
 
2145
- ggml_allocr_alloc_graph(alloc, gf);
 
2146
 
2147
  if (!whisper_encode_external(wstate)) {
2148
  if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
@@ -2155,11 +2075,12 @@ static bool whisper_encode_internal(
2155
  if (!whisper_encode_external(wstate)) {
2156
  auto & alloc = wstate.alloc_encode.alloc;
2157
 
2158
- ggml_allocr_reset(alloc);
2159
-
2160
  ggml_cgraph * gf = whisper_build_graph_encoder(wctx, wstate);
2161
 
2162
- ggml_allocr_alloc_graph(alloc, gf);
 
 
 
2163
 
2164
  if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
2165
  return false;
@@ -2170,11 +2091,12 @@ static bool whisper_encode_internal(
2170
  {
2171
  auto & alloc = wstate.alloc_cross.alloc;
2172
 
2173
- ggml_allocr_reset(alloc);
2174
-
2175
  ggml_cgraph * gf = whisper_build_graph_cross(wctx, wstate);
2176
 
2177
- ggml_allocr_alloc_graph(alloc, gf);
 
 
 
2178
 
2179
  if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
2180
  return false;
@@ -2190,7 +2112,8 @@ static bool whisper_encode_internal(
2190
  static struct ggml_cgraph * whisper_build_graph_decoder(
2191
  whisper_context & wctx,
2192
  whisper_state & wstate,
2193
- const whisper_batch & batch) {
 
2194
  const auto & model = wctx.model;
2195
  const auto & hparams = model.hparams;
2196
 
@@ -2198,8 +2121,6 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
2198
 
2199
  WHISPER_ASSERT(!!kv_self.ctx);
2200
 
2201
- ggml_allocr * alloc = wstate.alloc_decode.alloc;
2202
-
2203
  const int n_ctx = kv_self.size;
2204
  const int n_state = hparams.n_text_state;
2205
  const int n_head = hparams.n_text_head;
@@ -2208,8 +2129,8 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
2208
  const int n_tokens = batch.n_tokens;
2209
  const int n_audio_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
2210
 
2211
- const int32_t n_kv = ggml_allocr_is_measure(alloc) ? n_ctx : kv_self.n;
2212
- const int32_t kv_head = ggml_allocr_is_measure(alloc) ? n_ctx - n_tokens : kv_self.head;
2213
 
2214
  //WHISPER_LOG_DEBUG("%s: n_past = %d, n_tokens = %d, n_audio_ctx = %d, n_ctx = %d\n", __func__, n_past, n_tokens, n_audio_ctx, n_ctx);
2215
 
@@ -2224,48 +2145,18 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
2224
  ggml_cgraph * gf = ggml_new_graph_custom(ctx0, WHISPER_MAX_NODES, false);
2225
 
2226
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2227
- ggml_allocr_alloc(alloc, embd);
2228
-
2229
- if (!ggml_allocr_is_measure(alloc)) {
2230
- ggml_backend_tensor_set(embd, batch.token, 0, n_tokens*ggml_element_size(embd));
2231
- }
2232
 
2233
  struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2234
- ggml_allocr_alloc(alloc, position);
2235
-
2236
- if (!ggml_allocr_is_measure(alloc)) {
2237
- for (int i = 0; i < n_tokens; ++i) {
2238
- const int32_t val = batch.pos[i];
2239
- ggml_backend_tensor_set(position, &val, i*sizeof(int32_t), sizeof(int32_t));
2240
- }
2241
- }
2242
 
2243
  const float KQscale = pow(float(n_state)/n_head, -0.25);
2244
 
2245
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
2246
- ggml_allocr_alloc(alloc, KQ_mask);
2247
-
2248
- if (!ggml_allocr_is_measure(alloc)) {
2249
- wstate.inp_mask.resize(n_kv*n_tokens);
2250
-
2251
- float * data = wstate.inp_mask.data();
2252
- memset(data, 0, ggml_nbytes(KQ_mask));
2253
-
2254
- for (int h = 0; h < 1; ++h) {
2255
- for (int j = 0; j < n_tokens; ++j) {
2256
- const whisper_pos pos = batch.pos[j];
2257
- const whisper_seq_id seq_id = batch.seq_id[j][0];
2258
-
2259
- for (int i = 0; i < n_kv; ++i) {
2260
- if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
2261
- data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
2262
- }
2263
- }
2264
- }
2265
- }
2266
-
2267
- ggml_backend_tensor_set(KQ_mask, wstate.inp_mask.data(), 0, ggml_nelements(KQ_mask)*sizeof(float));
2268
- }
2269
 
2270
  // token encoding + position encoding
2271
  struct ggml_tensor * cur =
@@ -2592,11 +2483,53 @@ static bool whisper_decode_internal(
2592
  {
2593
  auto & alloc = wstate.alloc_decode.alloc;
2594
 
2595
- ggml_allocr_reset(alloc);
2596
 
2597
- ggml_cgraph * gf = whisper_build_graph_decoder(wctx, wstate, batch);
 
 
 
2598
 
2599
- ggml_allocr_alloc_graph(alloc, gf);
 
2600
 
2601
  logits = gf->nodes[gf->n_nodes - 1];
2602
 
@@ -3046,6 +2979,11 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
3046
  whisper_state * state = new whisper_state;
3047
 
3048
  state->backend = whisper_backend_init(ctx->params);
 
 
 
 
 
3049
 
3050
  // at this point, we don't know yet how many decoders will be used, so we overallocate 3x ctx
3051
  // in theory, there can be a case where this is not enough, but in practice it should always be enough
@@ -3053,7 +2991,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
3053
 
3054
  if (!kv_cache_init(ctx->model.hparams, state->kv_self, ctx->backend, ctx->itype, factor*ctx->model.hparams.n_text_ctx)) {
3055
  WHISPER_LOG_ERROR("%s: kv_cache_init() failed for self-attention cache\n", __func__);
3056
- delete state;
3057
  return nullptr;
3058
  }
3059
 
@@ -3064,7 +3002,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
3064
 
3065
  if (!kv_cache_init(ctx->model.hparams, state->kv_cross, ctx->backend, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
3066
  WHISPER_LOG_ERROR("%s: kv_cache_init() failed for cross-attention cache\n", __func__);
3067
- delete state;
3068
  return nullptr;
3069
  }
3070
 
@@ -3083,7 +3021,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
3083
  if (!state->ctx_coreml) {
3084
  WHISPER_LOG_ERROR("%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
3085
  #ifndef WHISPER_COREML_ALLOW_FALLBACK
3086
- delete state;
3087
  return nullptr;
3088
  #endif
3089
  } else {
@@ -3107,37 +3045,55 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
3107
 
3108
  // conv allocator
3109
  {
3110
- whisper_allocr_graph_init(state->alloc_conv, ctx->backend,
3111
  [&]() {
3112
  return whisper_build_graph_conv(*ctx, *state, 0);
3113
  });
3114
 
 
 
 
 
 
 
3115
  WHISPER_LOG_INFO("%s: compute buffer (conv) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_conv) / 1e6);
3116
  }
3117
 
3118
  // encoder allocator
3119
  if (!whisper_encode_external(*state)) {
3120
- whisper_allocr_graph_init(state->alloc_encode, ctx->backend,
3121
  [&]() {
3122
  return whisper_build_graph_encoder(*ctx, *state);
3123
  });
3124
 
 
 
 
 
 
 
3125
  WHISPER_LOG_INFO("%s: compute buffer (encode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_encode) / 1e6);
3126
  }
3127
 
3128
  // cross allocator
3129
  {
3130
- whisper_allocr_graph_init(state->alloc_cross, ctx->backend,
3131
  [&]() {
3132
  return whisper_build_graph_cross(*ctx, *state);
3133
  });
3134
 
 
 
 
 
 
 
3135
  WHISPER_LOG_INFO("%s: compute buffer (cross) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_cross) / 1e6);
3136
  }
3137
 
3138
  // decoder allocator
3139
  {
3140
- whisper_allocr_graph_init(state->alloc_decode, ctx->backend,
3141
  [&]() {
3142
  const auto & hparams = ctx->model.hparams;
3143
 
@@ -3147,17 +3103,18 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
3147
 
3148
  whisper_batch_prep_legacy(state->batch, nullptr, n_tokens, n_past, 0);
3149
 
3150
- return whisper_build_graph_decoder(*ctx, *state, state->batch);
3151
  });
3152
 
 
 
 
 
 
 
3153
  WHISPER_LOG_INFO("%s: compute buffer (decode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_decode) / 1e6);
3154
  }
3155
 
3156
- whisper_allocr_graph_realloc(state->alloc_conv, ctx->backend);
3157
- whisper_allocr_graph_realloc(state->alloc_encode, ctx->backend);
3158
- whisper_allocr_graph_realloc(state->alloc_cross, ctx->backend);
3159
- whisper_allocr_graph_realloc(state->alloc_decode, ctx->backend);
3160
-
3161
  return state;
3162
  }
3163
 
@@ -3380,8 +3337,7 @@ struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loa
3380
  return whisper_init_with_params_no_state(loader, whisper_context_default_params());
3381
  }
3382
 
3383
- void whisper_free_state(struct whisper_state * state)
3384
- {
3385
  if (state) {
3386
  kv_cache_free(state->kv_self);
3387
  kv_cache_free(state->kv_cross);
@@ -3402,10 +3358,10 @@ void whisper_free_state(struct whisper_state * state)
3402
 
3403
  whisper_batch_free(state->batch);
3404
 
3405
- whisper_allocr_free(state->alloc_conv);
3406
- whisper_allocr_free(state->alloc_encode);
3407
- whisper_allocr_free(state->alloc_cross);
3408
- whisper_allocr_free(state->alloc_decode);
3409
 
3410
  ggml_backend_free(state->backend);
3411
 
@@ -3415,15 +3371,9 @@ void whisper_free_state(struct whisper_state * state)
3415
 
3416
  void whisper_free(struct whisper_context * ctx) {
3417
  if (ctx) {
3418
- if (ctx->model.ctx) {
3419
- ggml_free(ctx->model.ctx);
3420
- }
3421
 
3422
- for (auto & buffer : ctx->model.buffers) {
3423
- if (buffer) {
3424
- ggml_backend_buffer_free(buffer);
3425
- }
3426
- }
3427
 
3428
  whisper_free_state(ctx->state);
3429
 
 
471
 
472
  // ggml_allocr wrapper for whisper usage
473
  struct whisper_allocr {
474
+ ggml_gallocr_t alloc = nullptr;
475
 
476
  std::vector<uint8_t> meta;
 
 
477
  };
478
 
479
  static size_t whisper_allocr_size(struct whisper_allocr & allocr) {
480
+ return allocr.meta.size() + ggml_gallocr_get_buffer_size(allocr.alloc, 0);
481
  }
482
 
483
  // measure the memory usage of a graph and prepare the allocr's internal data buffer
484
+ static bool whisper_allocr_graph_init(struct whisper_allocr & allocr, ggml_backend_t backend, std::function<struct ggml_cgraph *()> && get_graph) {
485
  auto & alloc = allocr.alloc;
486
  auto & meta = allocr.meta;
487
 
488
+ alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
489
 
490
  meta.resize(ggml_tensor_overhead()*WHISPER_MAX_NODES + ggml_graph_overhead());
491
 
492
+ // since there are dependencies between the different graphs,
493
+ // we need to allocate them instead of only reserving to get the correct compute buffer size
494
+ if (!ggml_gallocr_alloc_graph(alloc, get_graph())) {
495
+ // failed to allocate the compute buffer
496
+ WHISPER_LOG_ERROR("%s: failed to allocate the compute buffer\n", __func__);
497
+ return false;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
498
  }
499
+ return true;
500
  }
501
 
502
  // medium
 
638
  struct ggml_tensor * k;
639
  struct ggml_tensor * v;
640
 
641
+ struct ggml_context * ctx = nullptr;
642
 
643
+ ggml_backend_buffer_t buffer = nullptr;
644
  };
645
 
646
  struct whisper_model {
 
678
  std::vector<whisper_layer_decoder> layers_decoder;
679
 
680
  // ggml context that contains all the meta information about the model tensors
681
+ struct ggml_context * ctx = nullptr;
682
 
683
  // the model backend data is read-only and can be shared between processors
684
+ ggml_backend_buffer_t buffer = nullptr;
685
 
686
  // tensors
687
  int n_loaded;
 
883
  cache.ctx = ggml_init(params);
884
 
885
  if (!cache.ctx) {
886
+ WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache context\n", __func__);
887
  return false;
888
  }
889
 
890
  cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
891
  cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
892
 
893
+ cache.buffer = ggml_backend_alloc_ctx_tensors(cache.ctx, backend);
894
+ if (!cache.buffer) {
895
+ WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache\n", __func__);
896
+ return false;
 
 
 
 
 
 
 
 
897
  }
898
 
899
  return true;
900
  }
901
 
902
  static void kv_cache_free(struct whisper_kv_cache & cache) {
903
+ ggml_free(cache.ctx);
904
+ ggml_backend_buffer_free(cache.buffer);
905
+ cache.ctx = nullptr;
 
 
906
  }
907
 
908
  static bool whisper_kv_cache_find_slot(
 
1483
  }
1484
 
1485
  wctx.backend = whisper_backend_init(wctx.params);
1486
+ if (!wctx.backend) {
1487
+ WHISPER_LOG_ERROR("%s: failed to initialize the backend\n", __func__);
1488
+ return false;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1489
  }
1490
 
1491
  // allocate tensors in the backend buffers
1492
+ model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, wctx.backend);
1493
+ if (!model.buffer) {
1494
+ WHISPER_LOG_ERROR("%s: failed to allocate memory for the model\n", __func__);
1495
+ return false;
1496
  }
1497
 
1498
+ size_t size_main = ggml_backend_buffer_get_size(model.buffer);
1499
+ WHISPER_LOG_INFO("%s: %8s total size = %8.2f MB\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6);
1500
+
1501
  // load weights
1502
  {
1503
  size_t total_size = 0;
 
1559
  return false;
1560
  }
1561
 
1562
+ //ggml_backend_t backend = wctx.backend;
1563
 
1564
  //printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str());
1565
 
1566
+ if (ggml_backend_buffer_is_host(model.buffer)) {
 
 
 
 
1567
  // for the CPU and Metal backend, we can read directly into the tensor
1568
  loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
1569
  BYTESWAP_TENSOR(tensor);
 
1591
  }
1592
  }
1593
 
 
 
 
 
1594
  wctx.t_load_us = ggml_time_us() - t_start_us;
1595
 
1596
  return true;
 
1619
  whisper_state & wstate,
1620
  const int mel_offset) {
1621
  const auto & model = wctx.model;
 
1622
  const auto & hparams = model.hparams;
1623
 
1624
  const int n_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
 
1636
 
1637
  ggml_cgraph * gf = ggml_new_graph(ctx0);
1638
 
 
 
1639
  struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
1640
+ ggml_set_name(mel, "mel");
1641
+ ggml_set_input(mel);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1642
 
1643
  struct ggml_tensor * cur = nullptr;
1644
 
 
2030
  {
2031
  auto & alloc = wstate.alloc_conv.alloc;
2032
 
 
 
2033
  ggml_cgraph * gf = whisper_build_graph_conv(wctx, wstate, mel_offset);
2034
 
2035
+ if (!ggml_gallocr_alloc_graph(alloc, gf)) {
2036
+ // should never happen as we pre-allocate the memory
2037
+ return false;
2038
+ }
2039
+
2040
+ // set the input
2041
+ {
2042
+ const auto & mel_inp = wstate.mel;
2043
+ const int n_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : wctx.model.hparams.n_audio_ctx;
2044
+
2045
+ struct ggml_tensor * mel = ggml_graph_get_tensor(gf, "mel");
2046
+
2047
+ assert(mel->type == GGML_TYPE_F32);
2048
+ assert(mel_inp.n_mel == wctx.model.hparams.n_mels);
2049
+
2050
+ wstate.inp_mel.resize(ggml_nelements(mel));
2051
+
2052
+ float * dst = wstate.inp_mel.data();
2053
+ memset(dst, 0, ggml_nbytes(mel));
2054
+
2055
+ const int i0 = std::min(mel_offset, mel_inp.n_len);
2056
+ const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);
2057
+
2058
+ for (int j = 0; j < mel_inp.n_mel; ++j) {
2059
+ for (int i = i0; i < i1; ++i) {
2060
+ dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i];
2061
+ }
2062
+ }
2063
+
2064
+ ggml_backend_tensor_set(mel, wstate.inp_mel.data(), 0, ggml_nelements(mel)*sizeof(float));
2065
+ }
2066
 
2067
  if (!whisper_encode_external(wstate)) {
2068
  if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
 
2075
  if (!whisper_encode_external(wstate)) {
2076
  auto & alloc = wstate.alloc_encode.alloc;
2077
 
 
 
2078
  ggml_cgraph * gf = whisper_build_graph_encoder(wctx, wstate);
2079
 
2080
+ if (!ggml_gallocr_alloc_graph(alloc, gf)) {
2081
+ // should never happen as we pre-allocate the memory
2082
+ return false;
2083
+ }
2084
 
2085
  if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
2086
  return false;
 
2091
  {
2092
  auto & alloc = wstate.alloc_cross.alloc;
2093
 
 
 
2094
  ggml_cgraph * gf = whisper_build_graph_cross(wctx, wstate);
2095
 
2096
+ if (!ggml_gallocr_alloc_graph(alloc, gf)) {
2097
+ // should never happen as we pre-allocate the memory
2098
+ return false;
2099
+ }
2100
 
2101
  if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
2102
  return false;
 
2112
  static struct ggml_cgraph * whisper_build_graph_decoder(
2113
  whisper_context & wctx,
2114
  whisper_state & wstate,
2115
+ const whisper_batch & batch,
2116
+ bool worst_case) {
2117
  const auto & model = wctx.model;
2118
  const auto & hparams = model.hparams;
2119
 
 
2121
 
2122
  WHISPER_ASSERT(!!kv_self.ctx);
2123
 
 
 
2124
  const int n_ctx = kv_self.size;
2125
  const int n_state = hparams.n_text_state;
2126
  const int n_head = hparams.n_text_head;
 
2129
  const int n_tokens = batch.n_tokens;
2130
  const int n_audio_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
2131
 
2132
+ const int32_t n_kv = worst_case ? n_ctx : kv_self.n;
2133
+ const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head;
2134
 
2135
  //WHISPER_LOG_DEBUG("%s: n_past = %d, n_tokens = %d, n_audio_ctx = %d, n_ctx = %d\n", __func__, n_past, n_tokens, n_audio_ctx, n_ctx);
2136
 
 
2145
  ggml_cgraph * gf = ggml_new_graph_custom(ctx0, WHISPER_MAX_NODES, false);
2146
 
2147
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2148
+ ggml_set_name(embd, "embd");
2149
+ ggml_set_input(embd);
 
 
 
2150
 
2151
  struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2152
+ ggml_set_name(position, "position");
2153
+ ggml_set_input(position);
 
 
 
 
 
 
2154
 
2155
  const float KQscale = pow(float(n_state)/n_head, -0.25);
2156
 
2157
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
2158
+ ggml_set_name(KQ_mask, "KQ_mask");
2159
+ ggml_set_input(KQ_mask);
 
2160
 
2161
  // token encoding + position encoding
2162
  struct ggml_tensor * cur =
 
2483
  {
2484
  auto & alloc = wstate.alloc_decode.alloc;
2485
 
2486
+ ggml_cgraph * gf = whisper_build_graph_decoder(wctx, wstate, batch, false);
2487
 
2488
+ if (!ggml_gallocr_alloc_graph(alloc, gf)) {
2489
+ // should never happen as we pre-allocate the memory
2490
+ return false;
2491
+ }
2492
 
2493
+ // set the inputs
2494
+ {
2495
+ struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");
2496
+ ggml_backend_tensor_set(embd, batch.token, 0, n_tokens*ggml_element_size(embd));
2497
+ }
2498
+
2499
+ {
2500
+ struct ggml_tensor * position = ggml_graph_get_tensor(gf, "position");
2501
+ for (int i = 0; i < n_tokens; ++i) {
2502
+ const int32_t val = batch.pos[i];
2503
+ ggml_backend_tensor_set(position, &val, i*sizeof(int32_t), sizeof(int32_t));
2504
+ }
2505
+ }
2506
+
2507
+ {
2508
+ struct ggml_tensor * KQ_mask = ggml_graph_get_tensor(gf, "KQ_mask");
2509
+
2510
+ auto & kv_self = wstate.kv_self;
2511
+ const int32_t n_kv = kv_self.n;
2512
+
2513
+ wstate.inp_mask.resize(n_kv*n_tokens);
2514
+
2515
+ float * data = wstate.inp_mask.data();
2516
+ memset(data, 0, ggml_nbytes(KQ_mask));
2517
+
2518
+ for (int h = 0; h < 1; ++h) {
2519
+ for (int j = 0; j < n_tokens; ++j) {
2520
+ const whisper_pos pos = batch.pos[j];
2521
+ const whisper_seq_id seq_id = batch.seq_id[j][0];
2522
+
2523
+ for (int i = 0; i < n_kv; ++i) {
2524
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
2525
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
2526
+ }
2527
+ }
2528
+ }
2529
+ }
2530
+
2531
+ ggml_backend_tensor_set(KQ_mask, wstate.inp_mask.data(), 0, ggml_nelements(KQ_mask)*sizeof(float));
2532
+ }
2533
 
2534
  logits = gf->nodes[gf->n_nodes - 1];
2535
 
 
2979
  whisper_state * state = new whisper_state;
2980
 
2981
  state->backend = whisper_backend_init(ctx->params);
2982
+ if (!state->backend) {
2983
+ WHISPER_LOG_ERROR("%s: whisper_backend_init() failed\n", __func__);
2984
+ whisper_free_state(state);
2985
+ return nullptr;
2986
+ }
2987
 
2988
  // at this point, we don't know yet how many decoders will be used, so we overallocate 3x ctx
2989
  // in theory, there can be a case where this is not enough, but in practice it should always be enough
 
2991
 
2992
  if (!kv_cache_init(ctx->model.hparams, state->kv_self, ctx->backend, ctx->itype, factor*ctx->model.hparams.n_text_ctx)) {
2993
  WHISPER_LOG_ERROR("%s: kv_cache_init() failed for self-attention cache\n", __func__);
2994
+ whisper_free_state(state);
2995
  return nullptr;
2996
  }
2997
 
 
3002
 
3003
  if (!kv_cache_init(ctx->model.hparams, state->kv_cross, ctx->backend, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
3004
  WHISPER_LOG_ERROR("%s: kv_cache_init() failed for cross-attention cache\n", __func__);
3005
+ whisper_free_state(state);
3006
  return nullptr;
3007
  }
3008
 
 
3021
  if (!state->ctx_coreml) {
3022
  WHISPER_LOG_ERROR("%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
3023
  #ifndef WHISPER_COREML_ALLOW_FALLBACK
3024
+ whisper_free_state(state);
3025
  return nullptr;
3026
  #endif
3027
  } else {
 
3045
 
3046
  // conv allocator
3047
  {
3048
+ bool ok = whisper_allocr_graph_init(state->alloc_conv, ctx->backend,
3049
  [&]() {
3050
  return whisper_build_graph_conv(*ctx, *state, 0);
3051
  });
3052
 
3053
+ if (!ok) {
3054
+ WHISPER_LOG_ERROR("%s: failed to init conv allocator\n", __func__);
3055
+ whisper_free_state(state);
3056
+ return nullptr;
3057
+ }
3058
+
3059
  WHISPER_LOG_INFO("%s: compute buffer (conv) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_conv) / 1e6);
3060
  }
3061
 
3062
  // encoder allocator
3063
  if (!whisper_encode_external(*state)) {
3064
+ bool ok = whisper_allocr_graph_init(state->alloc_encode, ctx->backend,
3065
  [&]() {
3066
  return whisper_build_graph_encoder(*ctx, *state);
3067
  });
3068
 
3069
+ if (!ok) {
3070
+ WHISPER_LOG_ERROR("%s: failed to init encoder allocator\n", __func__);
3071
+ whisper_free_state(state);
3072
+ return nullptr;
3073
+ }
3074
+
3075
  WHISPER_LOG_INFO("%s: compute buffer (encode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_encode) / 1e6);
3076
  }
3077
 
3078
  // cross allocator
3079
  {
3080
+ bool ok = whisper_allocr_graph_init(state->alloc_cross, ctx->backend,
3081
  [&]() {
3082
  return whisper_build_graph_cross(*ctx, *state);
3083
  });
3084
 
3085
+ if (!ok) {
3086
+ WHISPER_LOG_ERROR("%s: failed to init cross allocator\n", __func__);
3087
+ whisper_free_state(state);
3088
+ return nullptr;
3089
+ }
3090
+
3091
  WHISPER_LOG_INFO("%s: compute buffer (cross) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_cross) / 1e6);
3092
  }
3093
 
3094
  // decoder allocator
3095
  {
3096
+ bool ok = whisper_allocr_graph_init(state->alloc_decode, ctx->backend,
3097
  [&]() {
3098
  const auto & hparams = ctx->model.hparams;
3099
 
 
3103
 
3104
  whisper_batch_prep_legacy(state->batch, nullptr, n_tokens, n_past, 0);
3105
 
3106
+ return whisper_build_graph_decoder(*ctx, *state, state->batch, true);
3107
  });
3108
 
3109
+ if (!ok) {
3110
+ WHISPER_LOG_ERROR("%s: failed to init decoder allocator\n", __func__);
3111
+ whisper_free_state(state);
3112
+ return nullptr;
3113
+ }
3114
+
3115
  WHISPER_LOG_INFO("%s: compute buffer (decode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_decode) / 1e6);
3116
  }
3117
 
 
 
 
 
 
3118
  return state;
3119
  }
3120
 
 
3337
  return whisper_init_with_params_no_state(loader, whisper_context_default_params());
3338
  }
3339
 
3340
+ void whisper_free_state(struct whisper_state * state) {
 
3341
  if (state) {
3342
  kv_cache_free(state->kv_self);
3343
  kv_cache_free(state->kv_cross);
 
3358
 
3359
  whisper_batch_free(state->batch);
3360
 
3361
+ ggml_gallocr_free(state->alloc_conv.alloc);
3362
+ ggml_gallocr_free(state->alloc_encode.alloc);
3363
+ ggml_gallocr_free(state->alloc_cross.alloc);
3364
+ ggml_gallocr_free(state->alloc_decode.alloc);
3365
 
3366
  ggml_backend_free(state->backend);
3367
 
 
3371
 
3372
  void whisper_free(struct whisper_context * ctx) {
3373
  if (ctx) {
3374
+ ggml_free(ctx->model.ctx);
 
 
3375
 
3376
+ ggml_backend_buffer_free(ctx->model.buffer);
 
 
 
 
3377
 
3378
  whisper_free_state(ctx->state);
3379