jartine committed
Commit 7815f68 · unverified · 1 Parent(s): 95f6502

ggml : introduce GGML_CALL function annotation (llama/4850)


This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship between
the application code and the GPU modules.

This change does nothing unless the build defines -DGGML_MULTIPLATFORM,
which causes back-references and function pointers to conform to the MS
ABI, which is supported by NVCC, ROCm, Xcode, GCC and Clang across
platforms.
Files changed (9)
  1. ggml-backend-impl.h +30 -30
  2. ggml-backend.c +40 -40
  3. ggml-backend.h +25 -25
  4. ggml-cuda.cu +60 -61
  5. ggml-cuda.h +16 -16
  6. ggml-metal.h +2 -2
  7. ggml-metal.m +21 -21
  8. ggml.c +16 -16
  9. ggml.h +34 -24
ggml-backend-impl.h CHANGED
@@ -16,14 +16,14 @@ extern "C" {
  typedef void * ggml_backend_buffer_type_context_t;

  struct ggml_backend_buffer_type_i {
- const char * (*get_name) (ggml_backend_buffer_type_t buft);
- ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
- size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
- size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
- bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+ const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
+ ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
+ size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
+ size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+ bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
  // check if tensor data is in host memory
  // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
- bool (*is_host) (ggml_backend_buffer_type_t buft);
+ bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
  };

  struct ggml_backend_buffer_type {
@@ -35,15 +35,15 @@ extern "C" {
  typedef void * ggml_backend_buffer_context_t;

  struct ggml_backend_buffer_i {
- const char * (*get_name) (ggml_backend_buffer_t buffer);
- void (*free_buffer)(ggml_backend_buffer_t buffer);
- void * (*get_base) (ggml_backend_buffer_t buffer);
- void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
- void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
- bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
- void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
- void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
+ const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
+ void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
+ void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
+ void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+ bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
+ void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
+ void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
  };

  struct ggml_backend_buffer {
@@ -54,7 +54,7 @@ extern "C" {
  enum ggml_backend_buffer_usage usage;
  };

- ggml_backend_buffer_t ggml_backend_buffer_init(
+ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
  ggml_backend_buffer_type_t buft,
  struct ggml_backend_buffer_i iface,
  ggml_backend_buffer_context_t context,
@@ -70,31 +70,31 @@ extern "C" {
  typedef void * ggml_backend_context_t;

  struct ggml_backend_i {
- const char * (*get_name)(ggml_backend_t backend);
+ const char * (*GGML_CALL get_name)(ggml_backend_t backend);

- void (*free)(ggml_backend_t backend);
+ void (*GGML_CALL free)(ggml_backend_t backend);

  // buffer allocation
- ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
+ ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);

  // (optional) asynchronous tensor data access
- void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
- bool (*cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
+ void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+ bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);

  // (optional) complete all pending operations
- void (*synchronize)(ggml_backend_t backend);
+ void (*GGML_CALL synchronize)(ggml_backend_t backend);

  // compute graph with a plan
- ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
- void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
- void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+ ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
+ void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+ void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

  // compute graph without a plan (async)
- bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

  // check if the backend supports an operation
- bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+ bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
  };

  struct ggml_backend {
@@ -107,9 +107,9 @@ extern "C" {
  // Backend registry
  //

- typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
+ typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);

- void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
+ GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);

  #ifdef __cplusplus
  }
ggml-backend.c CHANGED
@@ -19,7 +19,7 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
  return buft->iface.get_name(buft);
  }

- ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
  return buft->iface.alloc_buffer(buft, size);
  }

@@ -27,7 +27,7 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
  return buft->iface.get_alignment(buft);
  }

- size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
+ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
  // get_alloc_size is optional, defaults to ggml_nbytes
  if (buft->iface.get_alloc_size) {
  return buft->iface.get_alloc_size(buft, tensor);
@@ -48,7 +48,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {

  // backend buffer

- ggml_backend_buffer_t ggml_backend_buffer_init(
+ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
  ggml_backend_buffer_type_t buft,
  struct ggml_backend_buffer_i iface,
  ggml_backend_buffer_context_t context,
@@ -95,7 +95,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
  return base;
  }

- void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+ GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
  // init_tensor is optional
  if (buffer->iface.init_tensor) {
  buffer->iface.init_tensor(buffer, tensor);
@@ -191,7 +191,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
  }
  }

- void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
@@ -201,7 +201,7 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz
  tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
  }

- void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
@@ -318,9 +318,9 @@ struct ggml_backend_reg {
  static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
  static size_t ggml_backend_registry_count = 0;

- static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
+ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);

- static void ggml_backend_registry_init(void) {
+ GGML_CALL static void ggml_backend_registry_init(void) {
  static bool initialized = false;

  if (initialized) {
@@ -333,18 +333,18 @@ static void ggml_backend_registry_init(void) {

  // add forward decls here to avoid including the backend headers
  #ifdef GGML_USE_CUBLAS
- extern void ggml_backend_cuda_reg_devices(void);
+ extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
  ggml_backend_cuda_reg_devices();
  #endif

  #ifdef GGML_USE_METAL
- extern ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
- extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+ extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
+ extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
  ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
  #endif
  }

- void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
+ GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
  GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);

  size_t id = ggml_backend_registry_count;
@@ -439,33 +439,33 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {

  // backend CPU

- static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
+ GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
  return "CPU";

  GGML_UNUSED(buffer);
  }

- static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+ GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
  return (void *)buffer->context;
  }

- static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  free(buffer->context);
  }

- static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
  memcpy((char *)tensor->data + offset, data, size);

  GGML_UNUSED(buffer);
  }

- static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
  memcpy(data, (const char *)tensor->data + offset, size);

  GGML_UNUSED(buffer);
  }

- static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+ GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
  if (ggml_backend_buffer_is_host(src->buffer)) {
  memcpy(dst->data, src->data, ggml_nbytes(src));
  return true;
@@ -475,7 +475,7 @@ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
  GGML_UNUSED(buffer);
  }

- static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
  memset(buffer->context, value, buffer->size);
  }

@@ -506,13 +506,13 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {

  static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512

- static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+ GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
  return "CPU";

  GGML_UNUSED(buft);
  }

- static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
  size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
  void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?

@@ -521,25 +521,25 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
  return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
  }

- static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
  return TENSOR_ALIGNMENT;

  GGML_UNUSED(buft);
  }

- static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+ GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
  return ggml_backend_is_cpu(backend);

  GGML_UNUSED(buft);
  }

- static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+ GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
  return true;

  GGML_UNUSED(buft);
  }

- ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
  static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
  /* .iface = */ {
  /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
@@ -561,23 +561,23 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {

  #include <hbwmalloc.h>

- static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+ GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
  return "CPU_HBM";

  GGML_UNUSED(buft);
  }

- static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
+ GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
  return "CPU_HBM";

  GGML_UNUSED(buf);
  }

- static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  hbw_free(buffer->context);
  }

- static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
  //void * ptr = hbw_malloc(size);
  void * ptr;
  int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
@@ -617,20 +617,20 @@ struct ggml_backend_cpu_context {
  size_t work_size;
  };

- static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
+ GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
  return "CPU";

  GGML_UNUSED(backend);
  }

- static void ggml_backend_cpu_free(ggml_backend_t backend) {
+ GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
  free(cpu_ctx->work_data);
  free(cpu_ctx);
  free(backend);
  }

- static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
+ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
  return ggml_backend_cpu_buffer_type();

  GGML_UNUSED(backend);
@@ -641,7 +641,7 @@ struct ggml_backend_plan_cpu {
  struct ggml_cgraph cgraph;
  };

- static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
+ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

  struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
@@ -656,7 +656,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
  return cpu_plan;
  }

- static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

  free(cpu_plan->cplan.work_data);
@@ -665,7 +665,7 @@ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backen
  GGML_UNUSED(backend);
  }

- static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

  ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
@@ -673,7 +673,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
  GGML_UNUSED(backend);
  }

- static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

  struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -690,7 +690,7 @@ static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
  return true;
  }

- static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
  switch (op->op) {
  case GGML_OP_MUL_MAT:
  return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
@@ -732,7 +732,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
  return cpu_backend;
  }

- bool ggml_backend_is_cpu(ggml_backend_t backend) {
+ GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
  return backend && backend->iface.get_name == ggml_backend_cpu_name;
  }

@@ -743,11 +743,11 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
  ctx->n_threads = n_threads;
  }

- ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+ GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
  return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
  }

- static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
+ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
  return ggml_backend_cpu_init();

  GGML_UNUSED(params);
ggml-backend.h CHANGED
@@ -17,12 +17,12 @@ extern "C" {
  //

  // buffer type
- GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
- GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
- GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
- GGML_API size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
- GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
- GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
+ GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
+ GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
+ GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+ GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+ GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+ GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);

  // buffer
  enum ggml_backend_buffer_usage {
@@ -30,18 +30,18 @@ extern "C" {
  GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
  };

- GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
- GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
- GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
- GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
- GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
- GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
- GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
- GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
- GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
- GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
- GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
- GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
+ GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
+ GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
+ GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
+ GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
+ GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+ GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
+ GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
+ GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
+ GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);

  //
  // Backend
@@ -58,8 +58,8 @@ extern "C" {
  GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
  GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);

- GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+ GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);

  GGML_API void ggml_backend_synchronize(ggml_backend_t backend);

@@ -80,13 +80,13 @@ extern "C" {

  GGML_API ggml_backend_t ggml_backend_cpu_init(void);

- GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
- GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+ GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
+ GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);

  // Create a backend buffer from an existing pointer
- GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+ GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);

- GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

  #ifdef GGML_USE_CPU_HBM
  GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
@@ -183,7 +183,7 @@ extern "C" {
  GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
  GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);

- typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+ typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

  // Compare the output of two backends
  GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
ggml-cuda.cu CHANGED
@@ -7615,11 +7615,11 @@ struct cuda_pool_alloc {

  static bool g_cublas_loaded = false;

- bool ggml_cublas_loaded(void) {
+ GGML_CALL bool ggml_cublas_loaded(void) {
  return g_cublas_loaded;
  }

- void ggml_init_cublas() {
+ GGML_CALL void ggml_init_cublas() {
  static bool initialized = false;

  if (!initialized) {
@@ -7707,7 +7707,7 @@ void ggml_init_cublas() {
  }
  }

- void * ggml_cuda_host_malloc(size_t size) {
+ GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
  if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
  return nullptr;
  }
@@ -7725,7 +7725,7 @@ void * ggml_cuda_host_malloc(size_t size) {
  return ptr;
  }

- void ggml_cuda_host_free(void * ptr) {
+ GGML_CALL void ggml_cuda_host_free(void * ptr) {
  CUDA_CHECK(cudaFreeHost(ptr));
  }

@@ -9242,7 +9242,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
  }

- bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+ GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
  if (!g_cublas_loaded) return false;

  const int64_t ne10 = src1->ne[0];
@@ -10013,7 +10013,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
  return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
  }

- static void ggml_cuda_set_main_device(const int main_device) {
+ GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
  if (main_device >= g_device_count) {
  fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
  main_device, g_device_count, g_main_device);
@@ -10028,7 +10028,7 @@ static void ggml_cuda_set_main_device(const int main_device) {
  }
  }

- bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
  if (!g_cublas_loaded) return false;

  ggml_cuda_func_t func;
@@ -10186,7 +10186,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  return true;
  }

- int ggml_cuda_get_device_count() {
+ GGML_CALL int ggml_cuda_get_device_count() {
  int device_count;
  if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
  return 0;
@@ -10194,7 +10194,7 @@ int ggml_cuda_get_device_count() {
  return device_count;
  }

- void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+ GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
  cudaDeviceProp prop;
  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
  snprintf(description, description_size, "%s", prop.name);
@@ -10244,27 +10244,27 @@ struct ggml_backend_cuda_buffer_context {
  }
  };

- static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
+ GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
  return ctx->name.c_str();
  }

- static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
+ GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
  return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
  }

- static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
  CUDA_CHECK(cudaFree(ctx->dev_ptr));
  delete ctx;
  }

- static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
  return ctx->dev_ptr;
  }

- static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

  if (tensor->view_src != NULL && tensor->view_offs == 0) {
@@ -10296,7 +10296,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
  }
  }

- static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
@@ -10307,7 +10307,7 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
  CUDA_CHECK(cudaDeviceSynchronize());
  }

- static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
@@ -10318,7 +10318,7 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co
  CUDA_CHECK(cudaDeviceSynchronize());
  }

- static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
  if (ggml_backend_buffer_is_cuda(src->buffer)) {
  ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
  ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
@@ -10335,7 +10335,7 @@ static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, co
  return false;
  }

- static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

  ggml_cuda_set_device(ctx->device);
@@ -10357,19 +10357,18 @@ static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
  };

  // cuda buffer type
-
  struct ggml_backend_cuda_buffer_type_context {
  int device;
  std::string name;
  };

- static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
  ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;

  return ctx->name.c_str();
  }

- static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
  ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;

  ggml_cuda_set_device(buft_ctx->device);
@@ -10388,13 +10387,13 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
  return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
  }

- static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
  return 128;

  UNUSED(buft);
  }

- static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
  int64_t row_low = 0;
  int64_t row_high = ggml_nrows(tensor);
  int64_t nrows_split = row_high - row_low;
@@ -10414,7 +10413,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
  UNUSED(buft);
  }

- static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
  if (!ggml_backend_is_cuda(backend)) {
  return false;
  }
@@ -10434,7 +10433,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
  /* .is_host = */ NULL,
  };

- ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
  // FIXME: this is not thread safe
  if (device >= ggml_backend_cuda_get_device_count()) {
  return nullptr;
@@ -10479,7 +10478,7 @@ struct ggml_backend_cuda_split_buffer_context {
  std::vector<ggml_tensor_extra_gpu *> tensor_extras;
  };

- static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
  return GGML_CUDA_NAME "_Split";

  UNUSED(buffer);
@@ -10490,19 +10489,19 @@ static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_
  // return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
  //}

- static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
  delete ctx;
  }

- static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
  // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
  return (void *)0x1000;

  UNUSED(buffer);
  }

- static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
  GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported

  ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
@@ -10552,7 +10551,7 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf
  tensor->extra = extra;
  }

- static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
  // split tensors must always be set in their entirety at once
  GGML_ASSERT(offset == 0);
  GGML_ASSERT(size == ggml_nbytes(tensor));
@@ -10586,7 +10585,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
  }
  }

- static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
  // split tensors must always be set in their entirety at once
  GGML_ASSERT(offset == 0);
  GGML_ASSERT(size == ggml_nbytes(tensor));
@@ -10620,7 +10619,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
  }
  }

- static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
  UNUSED(buffer);
  UNUSED(value);
  }
@@ -10639,13 +10638,13 @@ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {

  // cuda split buffer type

- static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
  return GGML_CUDA_NAME "_Split";

  UNUSED(buft);
  }

- static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
  // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
  // instead, we allocate them for each tensor separately in init_tensor
  // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
@@ -10655,13 +10654,13 @@ static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(gg
  return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
  }

- static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
  return 128;

  UNUSED(buft);
  }

- static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
  ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;

  size_t total_size = 0;
@@ -10688,13 +10687,13 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_bu
  return total_size;
  }

- static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
  return ggml_backend_is_cuda(backend);

  UNUSED(buft);
  }

- static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
  return false;

  UNUSED(buft);
@@ -10709,7 +10708,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
  /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
  };

- ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
  // FIXME: this is not thread safe
  static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;

@@ -10745,23 +10744,23 @@ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * ten

  // host buffer type

- static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
  return GGML_CUDA_NAME "_Host";

  UNUSED(buft);
  }

- static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
  return GGML_CUDA_NAME "_Host";

  UNUSED(buffer);
  }

- static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  ggml_cuda_host_free(buffer->context);
  }

- static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
  void * ptr = ggml_cuda_host_malloc(size);

  if (ptr == nullptr) {
@@ -10777,7 +10776,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
  return buffer;
  }

- ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
  static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
  /* .iface = */ {
  /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
@@ -10795,26 +10794,26 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {

  // backend

- static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

  return cuda_ctx->name.c_str();
  }

- static void ggml_backend_cuda_free(ggml_backend_t backend) {
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

  delete cuda_ctx;
  delete backend;
  }

- static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

  return ggml_backend_cuda_buffer_type(cuda_ctx->device);
  }

- static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
@@ -10823,7 +10822,7 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
  CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
  }

- static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
@@ -10832,7 +10831,7 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
  CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
  }

- static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

  if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
@@ -10843,7 +10842,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggm
  return false;
  }

- static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

  CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
@@ -10851,7 +10850,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
  UNUSED(backend);
  }

- static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

  ggml_cuda_set_main_device(cuda_ctx->device);
@@ -10890,7 +10889,7 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
  return true;
  }

- static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
  switch (op->op) {
  case GGML_OP_UNARY:
  switch (ggml_get_unary_op(op)) {
@@ -11016,7 +11015,7 @@ static ggml_backend_i ggml_backend_cuda_interface = {
  /* .supports_op = */ ggml_backend_cuda_supports_op,
  };

- ggml_backend_t ggml_backend_cuda_init(int device) {
  ggml_init_cublas(); // TODO: remove from ggml.c

  if (device < 0 || device >= ggml_cuda_get_device_count()) {
@@ -11040,35 +11039,35 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
  return cuda_backend;
  }

- bool ggml_backend_is_cuda(ggml_backend_t backend) {
  return backend && backend->iface.get_name == ggml_backend_cuda_name;
  }

- int ggml_backend_cuda_get_device_count() {
  return ggml_cuda_get_device_count();
  }

- void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
  ggml_cuda_get_device_description(device, description, description_size);
  }

- void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
  ggml_cuda_set_device(device);

  CUDA_CHECK(cudaMemGetInfo(free, total));
  }

  // backend registry
- static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
  ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
  return cuda_backend;

  UNUSED(params);
  }

- extern "C" int ggml_backend_cuda_reg_devices();

- int ggml_backend_cuda_reg_devices() {
  int device_count = ggml_cuda_get_device_count();
  //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
  for (int i = 0; i < device_count; i++) {
10258
  CUDA_CHECK(cudaFree(ctx->dev_ptr));
10259
  delete ctx;
10260
  }
10261
 
10262
+ GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
10263
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10264
  return ctx->dev_ptr;
10265
  }
10266
 
10267
+ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
10268
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10269
 
10270
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
 
10296
  }
10297
  }
10298
 
10299
+ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
10300
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
10301
 
10302
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
10307
  CUDA_CHECK(cudaDeviceSynchronize());
10308
  }
10309
 
10310
+ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
10311
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
10312
 
10313
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
10318
  CUDA_CHECK(cudaDeviceSynchronize());
10319
  }
10320
 
10321
+ GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
10322
  if (ggml_backend_buffer_is_cuda(src->buffer)) {
10323
  ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
10324
  ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
10335
  return false;
10336
  }
10337
 
10338
+ GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
10339
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10340
 
10341
  ggml_cuda_set_device(ctx->device);
 
10357
  };
10358
 
10359
  // cuda buffer type
 
10360
  struct ggml_backend_cuda_buffer_type_context {
10361
  int device;
10362
  std::string name;
10363
  };
10364
 
10365
+ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
10366
  ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
10367
 
10368
  return ctx->name.c_str();
10369
  }
10370
 
10371
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
10372
  ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
10373
 
10374
  ggml_cuda_set_device(buft_ctx->device);
 
10387
  return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
10388
  }
10389
 
10390
+ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
10391
  return 128;
10392
 
10393
  UNUSED(buft);
10394
  }
10395
 
10396
+ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
10397
  int64_t row_low = 0;
10398
  int64_t row_high = ggml_nrows(tensor);
10399
  int64_t nrows_split = row_high - row_low;
 
10413
  UNUSED(buft);
10414
  }
10415
 
10416
+ GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
10417
  if (!ggml_backend_is_cuda(backend)) {
10418
  return false;
10419
  }
 
10433
  /* .is_host = */ NULL,
10434
  };
10435
 
10436
+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
10437
  // FIXME: this is not thread safe
10438
  if (device >= ggml_backend_cuda_get_device_count()) {
10439
  return nullptr;
 
10478
  std::vector<ggml_tensor_extra_gpu *> tensor_extras;
10479
  };
10480
 
10481
+ GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
10482
  return GGML_CUDA_NAME "_Split";
10483
 
10484
  UNUSED(buffer);
 
10489
  // return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
10490
  //}
10491
 
10492
+ GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
10493
  ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
10494
  delete ctx;
10495
  }
10496
 
10497
+ GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
10498
  // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
10499
  return (void *)0x1000;
10500
 
10501
  UNUSED(buffer);
10502
  }
10503
 
10504
+ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
10505
  GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
10506
 
10507
  ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
 
10551
  tensor->extra = extra;
10552
  }
10553
 
10554
+ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
10555
  // split tensors must always be set in their entirety at once
10556
  GGML_ASSERT(offset == 0);
10557
  GGML_ASSERT(size == ggml_nbytes(tensor));
 
10585
  }
10586
  }
10587
 
10588
+ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
10589
  // split tensors must always be set in their entirety at once
10590
  GGML_ASSERT(offset == 0);
10591
  GGML_ASSERT(size == ggml_nbytes(tensor));
 
10619
  }
10620
  }
10621
 
10622
+ GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
10623
  UNUSED(buffer);
10624
  UNUSED(value);
10625
  }
 
10638
 
10639
  // cuda split buffer type
10640
 
10641
+ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
10642
  return GGML_CUDA_NAME "_Split";
10643
 
10644
  UNUSED(buft);
10645
  }
10646
 
10647
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
10648
  // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
10649
  // instead, we allocate them for each tensor separately in init_tensor
10650
  // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
 
10654
  return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
10655
  }
10656
 
10657
+ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
10658
  return 128;
10659
 
10660
  UNUSED(buft);
10661
  }
10662
 
10663
+ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
10664
  ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
10665
 
10666
  size_t total_size = 0;
 
10687
  return total_size;
10688
  }
10689
 
10690
+ GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
10691
  return ggml_backend_is_cuda(backend);
10692
 
10693
  UNUSED(buft);
10694
  }
10695
 
10696
+ GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
10697
  return false;
10698
 
10699
  UNUSED(buft);
 
10708
  /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
10709
  };
10710
 
10711
+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
10712
  // FIXME: this is not thread safe
10713
  static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
10714
 
 
10744
 
10745
  // host buffer type
10746
 
10747
+ GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
10748
  return GGML_CUDA_NAME "_Host";
10749
 
10750
  UNUSED(buft);
10751
  }
10752
 
10753
+ GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
10754
  return GGML_CUDA_NAME "_Host";
10755
 
10756
  UNUSED(buffer);
10757
  }
10758
 
10759
+ GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
10760
  ggml_cuda_host_free(buffer->context);
10761
  }
10762
 
10763
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
10764
  void * ptr = ggml_cuda_host_malloc(size);
10765
 
10766
  if (ptr == nullptr) {
 
10776
  return buffer;
10777
  }
10778
 
10779
+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
10780
  static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
10781
  /* .iface = */ {
10782
  /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
 
10794
 
10795
  // backend
10796
 
10797
+ GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
10798
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10799
 
10800
  return cuda_ctx->name.c_str();
10801
  }
10802
 
10803
+ GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
10804
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10805
 
10806
  delete cuda_ctx;
10807
  delete backend;
10808
  }
10809
 
10810
+ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
10811
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10812
 
10813
  return ggml_backend_cuda_buffer_type(cuda_ctx->device);
10814
  }
10815
 
10816
+ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
10817
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10818
 
10819
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
 
10822
  CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
10823
  }
10824
 
10825
+ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
10826
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10827
 
10828
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
 
10831
  CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
10832
  }
10833
 
10834
+ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
10835
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10836
 
10837
  if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
 
10842
  return false;
10843
  }
10844
 
10845
+ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
10846
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10847
 
10848
  CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
 
10850
  UNUSED(backend);
10851
  }
10852
 
10853
+ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
10854
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10855
 
10856
  ggml_cuda_set_main_device(cuda_ctx->device);
 
10889
  return true;
10890
  }
10891
 
10892
+ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
10893
  switch (op->op) {
10894
  case GGML_OP_UNARY:
10895
  switch (ggml_get_unary_op(op)) {
 
11015
  /* .supports_op = */ ggml_backend_cuda_supports_op,
11016
  };
11017
 
11018
+ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
11019
  ggml_init_cublas(); // TODO: remove from ggml.c
11020
 
11021
  if (device < 0 || device >= ggml_cuda_get_device_count()) {
 
11039
  return cuda_backend;
11040
  }
11041
 
11042
+ GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
11043
  return backend && backend->iface.get_name == ggml_backend_cuda_name;
11044
  }
11045
 
11046
+ GGML_CALL int ggml_backend_cuda_get_device_count() {
11047
  return ggml_cuda_get_device_count();
11048
  }
11049
 
11050
+ GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
11051
  ggml_cuda_get_device_description(device, description, description_size);
11052
  }
11053
 
11054
+ GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
11055
  ggml_cuda_set_device(device);
11056
 
11057
  CUDA_CHECK(cudaMemGetInfo(free, total));
11058
  }
11059
 
11060
  // backend registry
11061
+ GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
11062
  ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
11063
  return cuda_backend;
11064
 
11065
  UNUSED(params);
11066
  }
11067
 
11068
+ extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
11069
 
11070
+ GGML_CALL int ggml_backend_cuda_reg_devices() {
11071
  int device_count = ggml_cuda_get_device_count();
11072
  //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
11073
  for (int i = 0; i < device_count; i++) {
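
If ggml-cuda.cu is built as its own shared object, the extern "C" registry entry point above is the natural symbol for a host binary to resolve at runtime. A minimal sketch of that host side, not part of this patch: the shared-object name, the helper name, and the silent CPU fallback are all illustrative; dlopen/dlsym are the POSIX loader calls, and the function-pointer type carries the same GGML_CALL annotation as the definition so the indirect call uses a single calling convention.

    #include <dlfcn.h>   // POSIX dynamic loader
    #include "ggml.h"    // provides GGML_CALL

    // pointer type mirrors the GGML_CALL annotation on ggml_backend_cuda_reg_devices()
    typedef int (GGML_CALL * reg_devices_fn)(void);

    // returns the number of registered CUDA devices, or 0 if the module is unavailable
    static int try_register_cuda_backend(void) {
        void * handle = dlopen("libggml-cuda.so", RTLD_NOW | RTLD_LOCAL); // hypothetical soname
        if (handle == NULL) {
            return 0; // GPU module not shipped or not installed; caller falls back to CPU
        }
        reg_devices_fn reg = (reg_devices_fn) dlsym(handle, "ggml_backend_cuda_reg_devices");
        if (reg == NULL) {
            dlclose(handle);
            return 0;
        }
        return reg();
    }
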
ggml-cuda.h CHANGED
@@ -18,34 +18,34 @@ extern "C" {
18
  #define GGML_CUDA_MAX_DEVICES 16
19
 
20
  // Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
21
- GGML_API void ggml_init_cublas(void);
22
 
23
  // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
24
- GGML_API bool ggml_cublas_loaded(void);
25
 
26
- GGML_API void * ggml_cuda_host_malloc(size_t size);
27
- GGML_API void ggml_cuda_host_free(void * ptr);
28
 
29
- GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
30
- GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
31
 
32
- GGML_API int ggml_cuda_get_device_count(void);
33
- GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
34
 
35
  // backend API
36
- GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
37
 
38
- GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
39
 
40
- GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
41
  // split tensor buffer that splits matrices by rows across multiple devices
42
- GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
43
  // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
44
- GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
45
 
46
- GGML_API int ggml_backend_cuda_get_device_count(void);
47
- GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
48
- GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
49
 
50
  #ifdef __cplusplus
51
  }
 
18
  #define GGML_CUDA_MAX_DEVICES 16
19
 
20
  // Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
21
+ GGML_API GGML_CALL void ggml_init_cublas(void);
22
 
23
  // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
24
+ GGML_API GGML_CALL bool ggml_cublas_loaded(void);
25
 
26
+ GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
27
+ GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);
28
 
29
+ GGML_API GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
30
+ GGML_API GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
31
 
32
+ GGML_API GGML_CALL int ggml_cuda_get_device_count(void);
33
+ GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
34
 
35
  // backend API
36
+ GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
37
 
38
+ GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
39
 
40
+ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
41
  // split tensor buffer that splits matrices by rows across multiple devices
42
+ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
43
  // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
44
+ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
45
 
46
+ GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
47
+ GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
48
+ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
49
 
50
  #ifdef __cplusplus
51
  }
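
For a conventional statically linked build nothing changes at the call sites: with GGML_MULTIPLATFORM undefined, GGML_CALL expands to nothing and the prototypes above are exactly what they were before. A small consumer sketch using only the functions declared in this header, assuming the CUDA backend is compiled in:

    #include <stdio.h>
    #include "ggml-cuda.h"

    int main(void) {
        ggml_init_cublas();                 // always returns; check the result separately
        if (!ggml_cublas_loaded()) {
            fprintf(stderr, "no usable CUDA devices\n");
            return 1;
        }
        char desc[128];
        ggml_cuda_get_device_description(0, desc, sizeof(desc));
        printf("device 0: %s (%d device(s) total)\n", desc, ggml_cuda_get_device_count());
        return 0;
    }
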
ggml-metal.h CHANGED
@@ -47,11 +47,11 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
47
 
48
  GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
49
 
50
- GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
51
 
52
  GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
53
 
54
- GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
55
 
56
  // helper to check if the device supports a specific family
57
  // ideally, the user code should be doing these checks
 
47
 
48
  GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
49
 
50
+ GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
51
 
52
  GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
53
 
54
+ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
55
 
56
  // helper to check if the device supports a specific family
57
  // ideally, the user code should be doing these checks
ggml-metal.m CHANGED
@@ -2294,13 +2294,13 @@ static void ggml_backend_metal_free_device(void) {
2294
  }
2295
  }
2296
 
2297
- static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
2298
  return "Metal";
2299
 
2300
  UNUSED(buffer);
2301
  }
2302
 
2303
- static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
2304
  struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
2305
 
2306
  for (int i = 0; i < ctx->n_buffers; i++) {
@@ -2315,25 +2315,25 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
2315
  free(ctx);
2316
  }
2317
 
2318
- static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
2319
  struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
2320
 
2321
  return ctx->all_data;
2322
  }
2323
 
2324
- static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2325
  memcpy((char *)tensor->data + offset, data, size);
2326
 
2327
  UNUSED(buffer);
2328
  }
2329
 
2330
- static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
2331
  memcpy(data, (const char *)tensor->data + offset, size);
2332
 
2333
  UNUSED(buffer);
2334
  }
2335
 
2336
- static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
2337
  if (ggml_backend_buffer_is_host(src->buffer)) {
2338
  memcpy(dst->data, src->data, ggml_nbytes(src));
2339
  return true;
@@ -2343,7 +2343,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
2343
  UNUSED(buffer);
2344
  }
2345
 
2346
- static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
2347
  struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
2348
 
2349
  memset(ctx->all_data, value, ctx->all_size);
@@ -2363,13 +2363,13 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
2363
 
2364
  // default buffer type
2365
 
2366
- static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
2367
  return "Metal";
2368
 
2369
  UNUSED(buft);
2370
  }
2371
 
2372
- static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
2373
  struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
2374
 
2375
  const size_t size_page = sysconf(_SC_PAGESIZE);
@@ -2421,24 +2421,24 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
2421
  return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
2422
  }
2423
 
2424
- static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
2425
  return 32;
2426
  UNUSED(buft);
2427
  }
2428
 
2429
- static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
2430
  return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
2431
 
2432
  UNUSED(buft);
2433
  }
2434
 
2435
- static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
2436
  return true;
2437
 
2438
  UNUSED(buft);
2439
  }
2440
 
2441
- ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
2442
  static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
2443
  /* .iface = */ {
2444
  /* .get_name = */ ggml_backend_metal_buffer_type_get_name,
@@ -2456,7 +2456,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
2456
 
2457
  // buffer from ptr
2458
 
2459
- ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
2460
  struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
2461
 
2462
  ctx->all_data = data;
@@ -2543,31 +2543,31 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
2543
 
2544
  // backend
2545
 
2546
- static const char * ggml_backend_metal_name(ggml_backend_t backend) {
2547
  return "Metal";
2548
 
2549
  UNUSED(backend);
2550
  }
2551
 
2552
- static void ggml_backend_metal_free(ggml_backend_t backend) {
2553
  struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
2554
  ggml_metal_free(ctx);
2555
  free(backend);
2556
  }
2557
 
2558
- static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
2559
  return ggml_backend_metal_buffer_type();
2560
 
2561
  UNUSED(backend);
2562
  }
2563
 
2564
- static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
2565
  struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
2566
 
2567
  return ggml_metal_graph_compute(metal_ctx, cgraph);
2568
  }
2569
 
2570
- static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
2571
  struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
2572
 
2573
  return ggml_metal_supports_op(metal_ctx, op);
@@ -2630,9 +2630,9 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
2630
  return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
2631
  }
2632
 
2633
- ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
2634
 
2635
- ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
2636
  return ggml_backend_metal_init();
2637
 
2638
  GGML_UNUSED(params);
 
2294
  }
2295
  }
2296
 
2297
+ GGML_CALL static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
2298
  return "Metal";
2299
 
2300
  UNUSED(buffer);
2301
  }
2302
 
2303
+ GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
2304
  struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
2305
 
2306
  for (int i = 0; i < ctx->n_buffers; i++) {
 
2315
  free(ctx);
2316
  }
2317
 
2318
+ GGML_CALL static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
2319
  struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
2320
 
2321
  return ctx->all_data;
2322
  }
2323
 
2324
+ GGML_CALL static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2325
  memcpy((char *)tensor->data + offset, data, size);
2326
 
2327
  UNUSED(buffer);
2328
  }
2329
 
2330
+ GGML_CALL static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
2331
  memcpy(data, (const char *)tensor->data + offset, size);
2332
 
2333
  UNUSED(buffer);
2334
  }
2335
 
2336
+ GGML_CALL static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
2337
  if (ggml_backend_buffer_is_host(src->buffer)) {
2338
  memcpy(dst->data, src->data, ggml_nbytes(src));
2339
  return true;
 
2343
  UNUSED(buffer);
2344
  }
2345
 
2346
+ GGML_CALL static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
2347
  struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
2348
 
2349
  memset(ctx->all_data, value, ctx->all_size);
 
2363
 
2364
  // default buffer type
2365
 
2366
+ GGML_CALL static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
2367
  return "Metal";
2368
 
2369
  UNUSED(buft);
2370
  }
2371
 
2372
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
2373
  struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
2374
 
2375
  const size_t size_page = sysconf(_SC_PAGESIZE);
 
2421
  return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
2422
  }
2423
 
2424
+ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
2425
  return 32;
2426
  UNUSED(buft);
2427
  }
2428
 
2429
+ GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
2430
  return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
2431
 
2432
  UNUSED(buft);
2433
  }
2434
 
2435
+ GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
2436
  return true;
2437
 
2438
  UNUSED(buft);
2439
  }
2440
 
2441
+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
2442
  static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
2443
  /* .iface = */ {
2444
  /* .get_name = */ ggml_backend_metal_buffer_type_get_name,
 
2456
 
2457
  // buffer from ptr
2458
 
2459
+ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
2460
  struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
2461
 
2462
  ctx->all_data = data;
 
2543
 
2544
  // backend
2545
 
2546
+ GGML_CALL static const char * ggml_backend_metal_name(ggml_backend_t backend) {
2547
  return "Metal";
2548
 
2549
  UNUSED(backend);
2550
  }
2551
 
2552
+ GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) {
2553
  struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
2554
  ggml_metal_free(ctx);
2555
  free(backend);
2556
  }
2557
 
2558
+ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
2559
  return ggml_backend_metal_buffer_type();
2560
 
2561
  UNUSED(backend);
2562
  }
2563
 
2564
+ GGML_CALL static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
2565
  struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
2566
 
2567
  return ggml_metal_graph_compute(metal_ctx, cgraph);
2568
  }
2569
 
2570
+ GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
2571
  struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
2572
 
2573
  return ggml_metal_supports_op(metal_ctx, op);
 
2630
  return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
2631
  }
2632
 
2633
+ GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
2634
 
2635
+ GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
2636
  return ggml_backend_metal_init();
2637
 
2638
  GGML_UNUSED(params);
ggml.c CHANGED
@@ -1990,19 +1990,19 @@ void ggml_print_objects(const struct ggml_context * ctx) {
1990
  GGML_PRINT("%s: --- end ---\n", __func__);
1991
  }
1992
 
1993
- int64_t ggml_nelements(const struct ggml_tensor * tensor) {
1994
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1995
 
1996
  return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
1997
  }
1998
 
1999
- int64_t ggml_nrows(const struct ggml_tensor * tensor) {
2000
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2001
 
2002
  return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
2003
  }
2004
 
2005
- size_t ggml_nbytes(const struct ggml_tensor * tensor) {
2006
  size_t nbytes;
2007
  size_t blck_size = ggml_blck_size(tensor->type);
2008
  if (blck_size == 1) {
@@ -2025,15 +2025,15 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
2025
  return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
2026
  }
2027
 
2028
- int ggml_blck_size(enum ggml_type type) {
2029
  return type_traits[type].blck_size;
2030
  }
2031
 
2032
- size_t ggml_type_size(enum ggml_type type) {
2033
  return type_traits[type].type_size;
2034
  }
2035
 
2036
- size_t ggml_row_size(enum ggml_type type, int64_t ne) {
2037
  assert(ne % ggml_blck_size(type) == 0);
2038
  return ggml_type_size(type)*ne/ggml_blck_size(type);
2039
  }
@@ -2042,15 +2042,15 @@ double ggml_type_sizef(enum ggml_type type) {
2042
  return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
2043
  }
2044
 
2045
- const char * ggml_type_name(enum ggml_type type) {
2046
  return type_traits[type].type_name;
2047
  }
2048
 
2049
- bool ggml_is_quantized(enum ggml_type type) {
2050
  return type_traits[type].is_quantized;
2051
  }
2052
 
2053
- const char * ggml_op_name(enum ggml_op op) {
2054
  return GGML_OP_NAME[op];
2055
  }
2056
 
@@ -2062,7 +2062,7 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) {
2062
  return GGML_UNARY_OP_NAME[op];
2063
  }
2064
 
2065
- const char * ggml_op_desc(const struct ggml_tensor * t) {
2066
  if (t->op == GGML_OP_UNARY) {
2067
  enum ggml_unary_op uop = ggml_get_unary_op(t);
2068
  return ggml_unary_op_name(uop);
@@ -2072,7 +2072,7 @@ const char * ggml_op_desc(const struct ggml_tensor * t) {
2072
  }
2073
  }
2074
 
2075
- size_t ggml_element_size(const struct ggml_tensor * tensor) {
2076
  return ggml_type_size(tensor->type);
2077
  }
2078
 
@@ -2154,11 +2154,11 @@ size_t ggml_tensor_overhead(void) {
2154
  return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
2155
  }
2156
 
2157
- bool ggml_is_transposed(const struct ggml_tensor * tensor) {
2158
  return tensor->nb[0] > tensor->nb[1];
2159
  }
2160
 
2161
- bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
2162
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2163
 
2164
  return
@@ -2177,7 +2177,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
2177
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
2178
  }
2179
 
2180
- bool ggml_is_permuted(const struct ggml_tensor * tensor) {
2181
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2182
 
2183
  return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
@@ -3079,7 +3079,7 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
3079
  return (float *)(tensor->data);
3080
  }
3081
 
3082
- enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
3083
  GGML_ASSERT(tensor->op == GGML_OP_UNARY);
3084
  return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
3085
  }
@@ -11653,7 +11653,7 @@ static void ggml_rope_cache_init(
11653
  }
11654
  }
11655
 
11656
- void ggml_rope_yarn_corr_dims(
11657
  int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
11658
  ) {
11659
  // start and end correction dims
 
1990
  GGML_PRINT("%s: --- end ---\n", __func__);
1991
  }
1992
 
1993
+ GGML_CALL int64_t ggml_nelements(const struct ggml_tensor * tensor) {
1994
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1995
 
1996
  return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
1997
  }
1998
 
1999
+ GGML_CALL int64_t ggml_nrows(const struct ggml_tensor * tensor) {
2000
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2001
 
2002
  return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
2003
  }
2004
 
2005
+ GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) {
2006
  size_t nbytes;
2007
  size_t blck_size = ggml_blck_size(tensor->type);
2008
  if (blck_size == 1) {
 
2025
  return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
2026
  }
2027
 
2028
+ GGML_CALL int ggml_blck_size(enum ggml_type type) {
2029
  return type_traits[type].blck_size;
2030
  }
2031
 
2032
+ GGML_CALL size_t ggml_type_size(enum ggml_type type) {
2033
  return type_traits[type].type_size;
2034
  }
2035
 
2036
+ GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) {
2037
  assert(ne % ggml_blck_size(type) == 0);
2038
  return ggml_type_size(type)*ne/ggml_blck_size(type);
2039
  }
 
2042
  return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
2043
  }
2044
 
2045
+ GGML_CALL const char * ggml_type_name(enum ggml_type type) {
2046
  return type_traits[type].type_name;
2047
  }
2048
 
2049
+ GGML_CALL bool ggml_is_quantized(enum ggml_type type) {
2050
  return type_traits[type].is_quantized;
2051
  }
2052
 
2053
+ GGML_CALL const char * ggml_op_name(enum ggml_op op) {
2054
  return GGML_OP_NAME[op];
2055
  }
2056
 
 
2062
  return GGML_UNARY_OP_NAME[op];
2063
  }
2064
 
2065
+ GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) {
2066
  if (t->op == GGML_OP_UNARY) {
2067
  enum ggml_unary_op uop = ggml_get_unary_op(t);
2068
  return ggml_unary_op_name(uop);
 
2072
  }
2073
  }
2074
 
2075
+ GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) {
2076
  return ggml_type_size(tensor->type);
2077
  }
2078
 
 
2154
  return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
2155
  }
2156
 
2157
+ GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
2158
  return tensor->nb[0] > tensor->nb[1];
2159
  }
2160
 
2161
+ GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
2162
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2163
 
2164
  return
 
2177
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
2178
  }
2179
 
2180
+ GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
2181
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2182
 
2183
  return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
 
3079
  return (float *)(tensor->data);
3080
  }
3081
 
3082
+ GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
3083
  GGML_ASSERT(tensor->op == GGML_OP_UNARY);
3084
  return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
3085
  }
 
11653
  }
11654
  }
11655
 
11656
+ GGML_CALL void ggml_rope_yarn_corr_dims(
11657
  int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
11658
  ) {
11659
  // start and end correction dims
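
The ggml.c hunks above cover the other direction of the cycle: core helpers such as ggml_nrows(), ggml_nbytes() and ggml_row_size() are called back from the GPU modules, so their definitions now carry GGML_CALL as well. A rough sketch of such a back-reference as it might appear inside a separately built GPU module (the helper name is illustrative, not from the patch):

    #include "ggml.h"

    // byte size of a row-split slice, computed by calling back into ggml.c;
    // ggml_row_size() is GGML_CALL, so the call uses the same convention even
    // when this translation unit lives in its own shared object
    static size_t split_slice_nbytes(const struct ggml_tensor * t,
                                     int64_t row_low, int64_t row_high) {
        const int64_t nrows_split = row_high - row_low;
        return nrows_split * ggml_row_size(t->type, t->ne[0]);
    }
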
ggml.h CHANGED
@@ -187,6 +187,16 @@
187
  # define GGML_API
188
  #endif
189
 
190
  // TODO: support for clang
191
  #ifdef __GNUC__
192
  # define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
@@ -649,41 +659,41 @@ extern "C" {
649
  GGML_API void ggml_print_object (const struct ggml_object * obj);
650
  GGML_API void ggml_print_objects(const struct ggml_context * ctx);
651
 
652
- GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
653
- GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
654
- GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
655
- GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
656
 
657
- GGML_API int ggml_blck_size(enum ggml_type type);
658
- GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
659
- GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
660
 
661
  GGML_DEPRECATED(
662
  GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
663
  "use ggml_row_size() instead");
664
 
665
- GGML_API const char * ggml_type_name(enum ggml_type type);
666
- GGML_API const char * ggml_op_name (enum ggml_op op);
667
- GGML_API const char * ggml_op_symbol(enum ggml_op op);
668
 
669
- GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
670
- GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
671
 
672
- GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
673
 
674
- GGML_API bool ggml_is_quantized(enum ggml_type type);
675
 
676
  // TODO: temporary until model loading of ggml examples is refactored
677
  GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
678
 
679
- GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
680
- GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
681
- GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
682
- GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
683
- GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
684
- GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
685
- GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
686
- GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
687
 
688
  GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
689
 
@@ -770,7 +780,7 @@ extern "C" {
770
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
771
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
772
 
773
- GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
774
 
775
  GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
776
  GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
@@ -1413,7 +1423,7 @@ extern "C" {
1413
  float beta_slow);
1414
 
1415
  // compute correction dims for YaRN RoPE scaling
1416
- void ggml_rope_yarn_corr_dims(
1417
  int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
1418
 
1419
  // xPos RoPE, in-place, returns view(a)
 
187
  # define GGML_API
188
  #endif
189
 
190
+ #ifdef GGML_MULTIPLATFORM
191
+ # if defined(_WIN32)
192
+ # define GGML_CALL
193
+ # else
194
+ # define GGML_CALL __attribute__((__ms_abi__))
195
+ # endif
196
+ #else
197
+ # define GGML_CALL
198
+ #endif
199
+
200
  // TODO: support for clang
201
  #ifdef __GNUC__
202
  # define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
 
659
  GGML_API void ggml_print_object (const struct ggml_object * obj);
660
  GGML_API void ggml_print_objects(const struct ggml_context * ctx);
661
 
662
+ GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor);
663
+ GGML_API GGML_CALL int64_t ggml_nrows (const struct ggml_tensor * tensor);
664
+ GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
665
+ GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
666
 
667
+ GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type);
668
+ GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
669
+ GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
670
 
671
  GGML_DEPRECATED(
672
  GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
673
  "use ggml_row_size() instead");
674
 
675
+ GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
676
+ GGML_API GGML_CALL const char * ggml_op_name (enum ggml_op op);
677
+ GGML_API const char * ggml_op_symbol(enum ggml_op op);
678
 
679
+ GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
680
+ GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
681
 
682
+ GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor);
683
 
684
+ GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type);
685
 
686
  // TODO: temporary until model loading of ggml examples is refactored
687
  GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
688
 
689
+ GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
690
+ GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
691
+ GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
692
+ GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
693
+ GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
694
+ GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
695
+ GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
696
+ GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
697
 
698
  GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
699
 
 
780
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
781
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
782
 
783
+ GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
784
 
785
  GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
786
  GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
 
1423
  float beta_slow);
1424
 
1425
  // compute correction dims for YaRN RoPE scaling
1426
+ GGML_CALL void ggml_rope_yarn_corr_dims(
1427
  int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
1428
 
1429
  // xPos RoPE, in-place, returns view(a)
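
The macro block added above is the whole mechanism: without -DGGML_MULTIPLATFORM every GGML_CALL expands to nothing and these declarations are unchanged, while with it defined, non-Windows builds pick up __attribute__((__ms_abi__)) so annotated functions and the function pointers that reference them agree on one calling convention. A final illustrative sketch (not part of the patch) of the function-pointer case:

    #include "ggml.h"

    // the GGML_CALL in the pointer type matches the annotation on ggml_nbytes(),
    // so the indirect call below is ABI-correct whether or not GGML_MULTIPLATFORM
    // is defined, and regardless of which module holds the pointer
    typedef size_t (GGML_CALL * nbytes_fn)(const struct ggml_tensor *);

    static size_t tensor_nbytes_indirect(const struct ggml_tensor * t) {
        nbytes_fn fn = ggml_nbytes; // declared GGML_API GGML_CALL in ggml.h
        return fn(t);
    }
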