ggml : introduce GGML_CALL function annotation (llama/4850)
This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship between
the application code and the GPU modules.

The change does nothing unless the build defines -DGGML_MULTIPLATFORM,
which causes back-references and function pointers to conform to the MS ABI,
a calling convention supported by NVCC, ROCm, XCode, GCC, and Clang across platforms.
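
The annotation itself is a preprocessor macro. The ggml.h hunk is not reproduced on this page, so the exact definition is an assumption; a minimal sketch, using the flag named above and the GCC/Clang ms_abi attribute, would look like this:

    // sketch only: GGML_CALL expands to a calling-convention attribute under -DGGML_MULTIPLATFORM
    #ifdef GGML_MULTIPLATFORM
    #    if defined(_WIN32)
    #        define GGML_CALL                              // MSVC targets already use the MS ABI
    #    else
    #        define GGML_CALL __attribute__((__ms_abi__))  // force the MS ABI on GCC/Clang/NVCC
    #    endif
    #else
    #    define GGML_CALL                                  // default build: the annotation is a no-op
    #endif

In the default build the macro expands to nothing, so the annotated declarations below compile exactly as before.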
- ggml-backend-impl.h +30 -30
- ggml-backend.c +40 -40
- ggml-backend.h +25 -25
- ggml-cuda.cu +60 -61
- ggml-cuda.h +16 -16
- ggml-metal.h +2 -2
- ggml-metal.m +21 -21
- ggml.c +16 -16
- ggml.h +34 -24
ggml-backend-impl.h
CHANGED
@@ -16,14 +16,14 @@ extern "C" {
     typedef void * ggml_backend_buffer_type_context_t;

     struct ggml_backend_buffer_type_i {
-        const char *          (*get_name)         (ggml_backend_buffer_type_t buft);
-        ggml_backend_buffer_t (*alloc_buffer)     (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*get_alignment)    (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*get_alloc_size)   (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool                  (*supports_backend) (ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        const char *          (*GGML_CALL get_name)         (ggml_backend_buffer_type_t buft);
+        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)     (ggml_backend_buffer_type_t buft, size_t size);
+        size_t                (*GGML_CALL get_alignment)    (ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t                (*GGML_CALL get_alloc_size)   (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+        bool                  (*GGML_CALL supports_backend) (ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
         // check if tensor data is in host memory
         // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
-        bool                  (*is_host)           (ggml_backend_buffer_type_t buft);
+        bool                  (*GGML_CALL is_host)           (ggml_backend_buffer_type_t buft);
     };

     struct ggml_backend_buffer_type {
@@ -35,15 +35,15 @@ extern "C" {
     typedef void * ggml_backend_buffer_context_t;

     struct ggml_backend_buffer_i {
-        const char * (*get_name)    (ggml_backend_buffer_t buffer);
-        void         (*free_buffer) (ggml_backend_buffer_t buffer);
-        void *       (*get_base)    (ggml_backend_buffer_t buffer);
-        void         (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void         (*set_tensor)  (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void         (*get_tensor)  (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        bool         (*cpy_tensor)  (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
-        void         (*clear)       (ggml_backend_buffer_t buffer, uint8_t value);
-        void         (*reset)       (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
+        const char * (*GGML_CALL get_name)    (ggml_backend_buffer_t buffer);
+        void         (*GGML_CALL free_buffer) (ggml_backend_buffer_t buffer);
+        void *       (*GGML_CALL get_base)    (ggml_backend_buffer_t buffer);
+        void         (*GGML_CALL init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void         (*GGML_CALL set_tensor)  (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void         (*GGML_CALL get_tensor)  (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        bool         (*GGML_CALL cpy_tensor)  (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
+        void         (*GGML_CALL clear)       (ggml_backend_buffer_t buffer, uint8_t value);
+        void         (*GGML_CALL reset)       (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
     };

     struct ggml_backend_buffer {
@@ -54,7 +54,7 @@ extern "C" {
         enum ggml_backend_buffer_usage usage;
     };

-    ggml_backend_buffer_t ggml_backend_buffer_init(
+    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
               ggml_backend_buffer_type_t    buft,
        struct ggml_backend_buffer_i         iface,
               ggml_backend_buffer_context_t context,
@@ -70,31 +70,31 @@ extern "C" {
     typedef void * ggml_backend_context_t;

     struct ggml_backend_i {
-        const char * (*get_name)(ggml_backend_t backend);
+        const char * (*GGML_CALL get_name)(ggml_backend_t backend);

-        void (*free)(ggml_backend_t backend);
+        void (*GGML_CALL free)(ggml_backend_t backend);

         // buffer allocation
-        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
+        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);

         // (optional) asynchronous tensor data access
-        void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        bool (*cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);

         // (optional) complete all pending operations
-        void (*synchronize)(ggml_backend_t backend);
+        void (*GGML_CALL synchronize)(ggml_backend_t backend);

         // compute graph with a plan
-        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
-        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
+        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        void                      (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

         // compute graph without a plan (async)
-        bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

         // check if the backend supports an operation
-        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
     };

     struct ggml_backend {
@@ -107,9 +107,9 @@ extern "C" {
     // Backend registry
     //

-    typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
+    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);

-    void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
+    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);

 #ifdef __cplusplus
 }
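
To see what the annotation means for a backend implementation, the sketch below shows a hypothetical callback and the interface table it is assigned to; the names are illustrative and not part of this patch. The point is that the function definition carries the same GGML_CALL annotation as the function-pointer field it fills in, so the calling conventions agree when -DGGML_MULTIPLATFORM is set:

    // hypothetical backend callback: the definition and the pointer field share the annotation
    GGML_CALL static const char * my_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
        (void) buft;          // unused in this sketch
        return "MyBackend";
    }

    static struct ggml_backend_buffer_type_i my_buffer_type_iface = {
        /* .get_name         = */ my_buffer_type_get_name,
        /* .alloc_buffer     = */ NULL, // remaining callbacks omitted from this sketch
        /* .get_alignment    = */ NULL,
        /* .get_alloc_size   = */ NULL,
        /* .supports_backend = */ NULL,
        /* .is_host          = */ NULL,
    };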
ggml-backend.c
CHANGED
@@ -19,7 +19,7 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_name(buft);
 }

-ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     return buft->iface.alloc_buffer(buft, size);
 }

@@ -27,7 +27,7 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_alignment(buft);
 }

-size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
+GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
     // get_alloc_size is optional, defaults to ggml_nbytes
     if (buft->iface.get_alloc_size) {
         return buft->iface.get_alloc_size(buft, tensor);
@@ -48,7 +48,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {

 // backend buffer

-ggml_backend_buffer_t ggml_backend_buffer_init(
+GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
               ggml_backend_buffer_type_t    buft,
        struct ggml_backend_buffer_i         iface,
               ggml_backend_buffer_context_t context,
@@ -95,7 +95,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
     return base;
 }

-void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     // init_tensor is optional
     if (buffer->iface.init_tensor) {
         buffer->iface.init_tensor(buffer, tensor);
@@ -191,7 +191,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
     }
 }

-void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
@@ -201,7 +201,7 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz
     tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
 }

-void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
@@ -318,9 +318,9 @@ struct ggml_backend_reg {
 static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
 static size_t ggml_backend_registry_count = 0;

-static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
+GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);

-static void ggml_backend_registry_init(void) {
+GGML_CALL static void ggml_backend_registry_init(void) {
     static bool initialized = false;

     if (initialized) {
@@ -333,18 +333,18 @@ static void ggml_backend_registry_init(void) {

     // add forward decls here to avoid including the backend headers
 #ifdef GGML_USE_CUBLAS
-    extern void ggml_backend_cuda_reg_devices(void);
+    extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
     ggml_backend_cuda_reg_devices();
 #endif

 #ifdef GGML_USE_METAL
-    extern ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
-    extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+    extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
+    extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
     ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
 #endif
 }

-void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
+GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
     GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);

     size_t id = ggml_backend_registry_count;
@@ -439,33 +439,33 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {

 // backend CPU

-static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
+GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
     return "CPU";

     GGML_UNUSED(buffer);
 }

-static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
     return (void *)buffer->context;
 }

-static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     free(buffer->context);
 }

-static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     memcpy((char *)tensor->data + offset, data, size);

     GGML_UNUSED(buffer);
 }

-static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     memcpy(data, (const char *)tensor->data + offset, size);

     GGML_UNUSED(buffer);
 }

-static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
     if (ggml_backend_buffer_is_host(src->buffer)) {
         memcpy(dst->data, src->data, ggml_nbytes(src));
         return true;
@@ -475,7 +475,7 @@ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
     GGML_UNUSED(buffer);
 }

-static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     memset(buffer->context, value, buffer->size);
 }

@@ -506,13 +506,13 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {

 static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512

-static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "CPU";

     GGML_UNUSED(buft);
 }

-static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
     void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?

@@ -521,25 +521,25 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
     return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
 }

-static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return TENSOR_ALIGNMENT;

     GGML_UNUSED(buft);
 }

-static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return ggml_backend_is_cpu(backend);

     GGML_UNUSED(buft);
 }

-static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;

     GGML_UNUSED(buft);
 }

-ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
     static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
         /* .iface = */ {
             /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
@@ -561,23 +561,23 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {

 #include <hbwmalloc.h>

-static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "CPU_HBM";

     GGML_UNUSED(buft);
 }

-static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
+GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
     return "CPU_HBM";

     GGML_UNUSED(buf);
 }

-static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     hbw_free(buffer->context);
 }

-static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     //void * ptr = hbw_malloc(size);
     void * ptr;
     int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
@@ -617,20 +617,20 @@ struct ggml_backend_cpu_context {
     size_t work_size;
 };

-static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
+GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
     return "CPU";

     GGML_UNUSED(backend);
 }

-static void ggml_backend_cpu_free(ggml_backend_t backend) {
+GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
     free(cpu_ctx->work_data);
     free(cpu_ctx);
     free(backend);
 }

-static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
+GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
     return ggml_backend_cpu_buffer_type();

     GGML_UNUSED(backend);
@@ -641,7 +641,7 @@ struct ggml_backend_plan_cpu {
     struct ggml_cgraph cgraph;
 };

-static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
+GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
@@ -656,7 +656,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
     return cpu_plan;
 }

-static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

     free(cpu_plan->cplan.work_data);
@@ -665,7 +665,7 @@ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backen
     GGML_UNUSED(backend);
 }

-static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

     ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
@@ -673,7 +673,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
     GGML_UNUSED(backend);
 }

-static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -690,7 +690,7 @@ static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
     return true;
 }

-static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
@@ -732,7 +732,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
     return cpu_backend;
 }

-bool ggml_backend_is_cpu(ggml_backend_t backend) {
+GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
     return backend && backend->iface.get_name == ggml_backend_cpu_name;
 }

@@ -743,11 +743,11 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
     ctx->n_threads = n_threads;
 }

-ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
     return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
 }

-static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
+GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
     return ggml_backend_cpu_init();

     GGML_UNUSED(params);
ggml-backend.h
CHANGED
@@ -17,12 +17,12 @@ extern "C" {
     //

     // buffer type
-    GGML_API const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
-    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
-    GGML_API size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
-    GGML_API size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
-    GGML_API bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
+    GGML_API           const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
+    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
+    GGML_API           size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
+    GGML_API GGML_CALL size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API           bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+    GGML_API           bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);

     // buffer
     enum ggml_backend_buffer_usage {
@@ -30,18 +30,18 @@ extern "C" {
         GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
     };

-    GGML_API const char *               ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
-    GGML_API void                       ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
-    GGML_API void *                     ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
-    GGML_API size_t                     ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
-    GGML_API void                       ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API size_t                     ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
-    GGML_API size_t                     ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API void                       ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
-    GGML_API bool                       ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
-    GGML_API void                       ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-    GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
-    GGML_API void                       ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
+    GGML_API           const char *               ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
+    GGML_API           void                       ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
+    GGML_API           void *                     ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
+    GGML_API           size_t                     ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
+    GGML_API GGML_CALL void                       ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API           size_t                     ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API           size_t                     ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API           void                       ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API           bool                       ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
+    GGML_API           void                       ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+    GGML_API           ggml_backend_buffer_type_t ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
+    GGML_API           void                       ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);

     //
     // Backend
@@ -58,8 +58,8 @@ extern "C" {
     GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);

-    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+    GGML_API GGML_CALL void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

     GGML_API void ggml_backend_synchronize(ggml_backend_t backend);

@@ -80,13 +80,13 @@ extern "C" {

     GGML_API ggml_backend_t ggml_backend_cpu_init(void);

-    GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
-    GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+    GGML_API GGML_CALL bool ggml_backend_is_cpu           (ggml_backend_t backend);
+    GGML_API           void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);

     // Create a backend buffer from an existing pointer
-    GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);

-    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+    GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

 #ifdef GGML_USE_CPU_HBM
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
@@ -183,7 +183,7 @@ extern "C" {
     GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
     GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);

-    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+    typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

     // Compare the output of two backends
     GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
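
Call sites do not change; only the declarations gain the annotation. A small usage sketch against the CPU backend API above (it also assumes the ggml_backend_name and ggml_backend_free declarations from the same header, which are not part of this diff):

    #include "ggml-backend.h"
    #include <stdio.h>

    int main(void) {
        // exercise a few of the (now GGML_CALL-annotated) public entry points
        ggml_backend_t backend = ggml_backend_cpu_init();
        ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 1024);

        printf("backend: %s, buffer: %s, host: %d\n",
               ggml_backend_name(backend),
               ggml_backend_buffer_name(buf),
               ggml_backend_buffer_is_host(buf));

        ggml_backend_buffer_free(buf);
        ggml_backend_free(backend);
        return 0;
    }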
ggml-cuda.cu
CHANGED
@@ -7615,11 +7615,11 @@ struct cuda_pool_alloc {

 static bool g_cublas_loaded = false;

-bool ggml_cublas_loaded(void) {
+GGML_CALL bool ggml_cublas_loaded(void) {
     return g_cublas_loaded;
 }

-void ggml_init_cublas() {
+GGML_CALL void ggml_init_cublas() {
     static bool initialized = false;

     if (!initialized) {
@@ -7707,7 +7707,7 @@ void ggml_init_cublas() {
     }
 }

-void * ggml_cuda_host_malloc(size_t size) {
+GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
     if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
         return nullptr;
     }
@@ -7725,7 +7725,7 @@ void * ggml_cuda_host_malloc(size_t size) {
     return ptr;
 }

-void ggml_cuda_host_free(void * ptr) {
+GGML_CALL void ggml_cuda_host_free(void * ptr) {
     CUDA_CHECK(cudaFreeHost(ptr));
 }

@@ -9242,7 +9242,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }

-bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
     if (!g_cublas_loaded) return false;

     const int64_t ne10 = src1->ne[0];
@@ -10013,7 +10013,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
     return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
 }

-static void ggml_cuda_set_main_device(const int main_device) {
+GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
     if (main_device >= g_device_count) {
         fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
                 main_device, g_device_count, g_main_device);
@@ -10028,7 +10028,7 @@ static void ggml_cuda_set_main_device(const int main_device) {
     }
 }

-bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     if (!g_cublas_loaded) return false;

     ggml_cuda_func_t func;
@@ -10186,7 +10186,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     return true;
 }

-int ggml_cuda_get_device_count() {
+GGML_CALL int ggml_cuda_get_device_count() {
     int device_count;
     if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
         return 0;
@@ -10194,7 +10194,7 @@ int ggml_cuda_get_device_count() {
     return device_count;
 }

-void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
     cudaDeviceProp prop;
     CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
     snprintf(description, description_size, "%s", prop.name);
@@ -10244,27 +10244,27 @@ struct ggml_backend_cuda_buffer_context {
     }
 };

-static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
+GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     return ctx->name.c_str();
 }

-static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
+GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
     return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
 }

-static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     CUDA_CHECK(cudaFree(ctx->dev_ptr));
     delete ctx;
 }

-static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
+GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     return ctx->dev_ptr;
 }

-static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

     if (tensor->view_src != NULL && tensor->view_offs == 0) {
@@ -10296,7 +10296,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
     }
 }

-static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
@@ -10307,7 +10307,7 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
     CUDA_CHECK(cudaDeviceSynchronize());
 }

-static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
@@ -10318,7 +10318,7 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co
     CUDA_CHECK(cudaDeviceSynchronize());
 }

-static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
+GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
     if (ggml_backend_buffer_is_cuda(src->buffer)) {
         ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
         ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
@@ -10335,7 +10335,7 @@ static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, co
     return false;
 }

-static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

     ggml_cuda_set_device(ctx->device);
@@ -10357,19 +10357,18 @@ static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
 };

 // cuda buffer type
-
 struct ggml_backend_cuda_buffer_type_context {
     int device;
     std::string name;
 };

-static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
+GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
     ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;

     return ctx->name.c_str();
 }

-static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;

     ggml_cuda_set_device(buft_ctx->device);
@@ -10388,13 +10387,13 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
     return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
 }

-static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 128;

     UNUSED(buft);
 }

-static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
     int64_t row_low = 0;
     int64_t row_high = ggml_nrows(tensor);
     int64_t nrows_split = row_high - row_low;
@@ -10414,7 +10413,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
     UNUSED(buft);
 }

-static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     if (!ggml_backend_is_cuda(backend)) {
         return false;
     }
@@ -10434,7 +10433,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .is_host = */ NULL,
 };

-ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
     // FIXME: this is not thread safe
     if (device >= ggml_backend_cuda_get_device_count()) {
         return nullptr;
@@ -10479,7 +10478,7 @@ struct ggml_backend_cuda_split_buffer_context {
     std::vector<ggml_tensor_extra_gpu *> tensor_extras;
 };

-static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
+GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
     return GGML_CUDA_NAME "_Split";

     UNUSED(buffer);
@@ -10490,19 +10489,19 @@ static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_
 //    return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
 //}

-static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
     delete ctx;
 }

-static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
+GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
     // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
     return (void *)0x1000;

     UNUSED(buffer);
 }

-static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported

     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
@@ -10552,7 +10551,7 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf
     tensor->extra = extra;
 }

-static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     // split tensors must always be set in their entirety at once
     GGML_ASSERT(offset == 0);
     GGML_ASSERT(size == ggml_nbytes(tensor));
@@ -10586,7 +10585,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
     }
 }

-static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     // split tensors must always be set in their entirety at once
     GGML_ASSERT(offset == 0);
     GGML_ASSERT(size == ggml_nbytes(tensor));
@@ -10620,7 +10619,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
     }
 }

-static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     UNUSED(buffer);
     UNUSED(value);
 }
@@ -10639,13 +10638,13 @@ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {

 // cuda split buffer type

-static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
+GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
     return GGML_CUDA_NAME "_Split";

     UNUSED(buft);
 }

-static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
     // instead, we allocate them for each tensor separately in init_tensor
     // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
@@ -10655,13 +10654,13 @@ static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(gg
     return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
 }

-static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 128;

     UNUSED(buft);
 }

-static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
     ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;

     size_t total_size = 0;
@@ -10688,13 +10687,13 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_bu
     return total_size;
 }

-static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return ggml_backend_is_cuda(backend);

     UNUSED(buft);
 }

-static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return false;

     UNUSED(buft);
@@ -10709,7 +10708,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
     /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
 };

-ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
     // FIXME: this is not thread safe
     static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;

@@ -10745,23 +10744,23 @@ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * ten

 // host buffer type

-static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
     return GGML_CUDA_NAME "_Host";

     UNUSED(buft);
 }

-static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
+GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
     return GGML_CUDA_NAME "_Host";

     UNUSED(buffer);
 }

-static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_cuda_host_free(buffer->context);
 }

-static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     void * ptr = ggml_cuda_host_malloc(size);

     if (ptr == nullptr) {
@@ -10777,7 +10776,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
     return buffer;
 }

-ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
     static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
         /* .iface = */ {
             /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
@@ -10795,26 +10794,26 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {

 // backend

-static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

     return cuda_ctx->name.c_str();
 }

-static void ggml_backend_cuda_free(ggml_backend_t backend) {
+GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

     delete cuda_ctx;
     delete backend;
 }

-static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
+GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

     return ggml_backend_cuda_buffer_type(cuda_ctx->device);
 }

-static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
@@ -10823,7 +10822,7 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
     CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
 }

-static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
@@ -10832,7 +10831,7 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
     CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
 }

-static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
+GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

     if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
@@ -10843,7 +10842,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggm
     return false;
 }

-static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
+GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

     CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
@@ -10851,7 +10850,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     UNUSED(backend);
 }

-static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

     ggml_cuda_set_main_device(cuda_ctx->device);
@@ -10890,7 +10889,7 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     return true;
 }

-static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -11016,7 +11015,7 @@ static ggml_backend_i ggml_backend_cuda_interface = {
     /* .supports_op = */ ggml_backend_cuda_supports_op,
 };

-ggml_backend_t ggml_backend_cuda_init(int device) {
+GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     ggml_init_cublas(); // TODO: remove from ggml.c
|
| 11021 |
|
| 11022 |
if (device < 0 || device >= ggml_cuda_get_device_count()) {
|
|
@@ -11040,35 +11039,35 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
|
|
| 11040 |
return cuda_backend;
|
| 11041 |
}
|
| 11042 |
|
| 11043 |
-
bool ggml_backend_is_cuda(ggml_backend_t backend) {
|
| 11044 |
return backend && backend->iface.get_name == ggml_backend_cuda_name;
|
| 11045 |
}
|
| 11046 |
|
| 11047 |
-
int ggml_backend_cuda_get_device_count() {
|
| 11048 |
return ggml_cuda_get_device_count();
|
| 11049 |
}
|
| 11050 |
|
| 11051 |
-
void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
|
| 11052 |
ggml_cuda_get_device_description(device, description, description_size);
|
| 11053 |
}
|
| 11054 |
|
| 11055 |
-
void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
|
| 11056 |
ggml_cuda_set_device(device);
|
| 11057 |
|
| 11058 |
CUDA_CHECK(cudaMemGetInfo(free, total));
|
| 11059 |
}
|
| 11060 |
|
| 11061 |
// backend registry
|
| 11062 |
-
static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
|
| 11063 |
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
|
| 11064 |
return cuda_backend;
|
| 11065 |
|
| 11066 |
UNUSED(params);
|
| 11067 |
}
|
| 11068 |
|
| 11069 |
-
extern "C" int ggml_backend_cuda_reg_devices();
|
| 11070 |
|
| 11071 |
-
int ggml_backend_cuda_reg_devices() {
|
| 11072 |
int device_count = ggml_cuda_get_device_count();
|
| 11073 |
//int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
|
| 11074 |
for (int i = 0; i < device_count; i++) {
|
|
|
|
| 7615 |
|
| 7616 |
static bool g_cublas_loaded = false;
|
| 7617 |
|
| 7618 |
+
GGML_CALL bool ggml_cublas_loaded(void) {
|
| 7619 |
return g_cublas_loaded;
|
| 7620 |
}
|
| 7621 |
|
| 7622 |
+
GGML_CALL void ggml_init_cublas() {
|
| 7623 |
static bool initialized = false;
|
| 7624 |
|
| 7625 |
if (!initialized) {
|
|
|
|
| 7707 |
}
|
| 7708 |
}
|
| 7709 |
|
| 7710 |
+
GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
|
| 7711 |
if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
|
| 7712 |
return nullptr;
|
| 7713 |
}
|
|
|
|
| 7725 |
return ptr;
|
| 7726 |
}
|
| 7727 |
|
| 7728 |
+
GGML_CALL void ggml_cuda_host_free(void * ptr) {
|
| 7729 |
CUDA_CHECK(cudaFreeHost(ptr));
|
| 7730 |
}
|
| 7731 |
|
|
|
|
| 9242 |
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
|
| 9243 |
}
|
| 9244 |
|
| 9245 |
+
GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
| 9246 |
if (!g_cublas_loaded) return false;
|
| 9247 |
|
| 9248 |
const int64_t ne10 = src1->ne[0];
|
|
|
|
| 10013 |
return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
|
| 10014 |
}
|
| 10015 |
|
| 10016 |
+
GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
|
| 10017 |
if (main_device >= g_device_count) {
|
| 10018 |
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
|
| 10019 |
main_device, g_device_count, g_main_device);
|
|
|
|
| 10028 |
}
|
| 10029 |
}
|
| 10030 |
|
| 10031 |
+
GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
| 10032 |
if (!g_cublas_loaded) return false;
|
| 10033 |
|
| 10034 |
ggml_cuda_func_t func;
|
|
|
|
| 10186 |
return true;
|
| 10187 |
}
|
| 10188 |
|
| 10189 |
+
GGML_CALL int ggml_cuda_get_device_count() {
|
| 10190 |
int device_count;
|
| 10191 |
if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
|
| 10192 |
return 0;
|
|
|
|
| 10194 |
return device_count;
|
| 10195 |
}
|
| 10196 |
|
| 10197 |
+
GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
|
| 10198 |
cudaDeviceProp prop;
|
| 10199 |
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
|
| 10200 |
snprintf(description, description_size, "%s", prop.name);
|
|
|
|
| 10244 |
}
|
| 10245 |
};
|
| 10246 |
|
| 10247 |
+
GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
|
| 10248 |
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 10249 |
return ctx->name.c_str();
|
| 10250 |
}
|
| 10251 |
|
| 10252 |
+
GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
| 10253 |
return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
|
| 10254 |
}
|
| 10255 |
|
| 10256 |
+
GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 10257 |
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 10258 |
CUDA_CHECK(cudaFree(ctx->dev_ptr));
|
| 10259 |
delete ctx;
|
| 10260 |
}
|
| 10261 |
|
| 10262 |
+
GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
| 10263 |
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 10264 |
return ctx->dev_ptr;
|
| 10265 |
}
|
| 10266 |
|
| 10267 |
+
GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
| 10268 |
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 10269 |
|
| 10270 |
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
|
|
|
| 10296 |
}
|
| 10297 |
}
|
| 10298 |
|
| 10299 |
+
GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 10300 |
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
| 10301 |
|
| 10302 |
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
|
|
| 10307 |
CUDA_CHECK(cudaDeviceSynchronize());
|
| 10308 |
}
|
| 10309 |
|
| 10310 |
+
GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 10311 |
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
| 10312 |
|
| 10313 |
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
|
|
| 10318 |
CUDA_CHECK(cudaDeviceSynchronize());
|
| 10319 |
}
|
| 10320 |
|
| 10321 |
+
GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
| 10322 |
if (ggml_backend_buffer_is_cuda(src->buffer)) {
|
| 10323 |
ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
|
| 10324 |
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
|
|
| 10335 |
return false;
|
| 10336 |
}
|
| 10337 |
|
| 10338 |
+
GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
| 10339 |
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 10340 |
|
| 10341 |
ggml_cuda_set_device(ctx->device);
|
|
|
|
| 10357 |
};
|
| 10358 |
|
| 10359 |
// cuda buffer type
|
|
|
|
| 10360 |
struct ggml_backend_cuda_buffer_type_context {
|
| 10361 |
int device;
|
| 10362 |
std::string name;
|
| 10363 |
};
|
| 10364 |
|
| 10365 |
+
GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
| 10366 |
ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
| 10367 |
|
| 10368 |
return ctx->name.c_str();
|
| 10369 |
}
|
| 10370 |
|
| 10371 |
+
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 10372 |
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
| 10373 |
|
| 10374 |
ggml_cuda_set_device(buft_ctx->device);
|
|
|
|
| 10387 |
return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
|
| 10388 |
}
|
| 10389 |
|
| 10390 |
+
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
| 10391 |
return 128;
|
| 10392 |
|
| 10393 |
UNUSED(buft);
|
| 10394 |
}
|
| 10395 |
|
| 10396 |
+
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
| 10397 |
int64_t row_low = 0;
|
| 10398 |
int64_t row_high = ggml_nrows(tensor);
|
| 10399 |
int64_t nrows_split = row_high - row_low;
|
|
|
|
| 10413 |
UNUSED(buft);
|
| 10414 |
}
|
| 10415 |
|
| 10416 |
+
GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
| 10417 |
if (!ggml_backend_is_cuda(backend)) {
|
| 10418 |
return false;
|
| 10419 |
}
|
|
|
|
| 10433 |
/* .is_host = */ NULL,
|
| 10434 |
};
|
| 10435 |
|
| 10436 |
+
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
| 10437 |
// FIXME: this is not thread safe
|
| 10438 |
if (device >= ggml_backend_cuda_get_device_count()) {
|
| 10439 |
return nullptr;
|
|
|
|
| 10478 |
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
|
| 10479 |
};
|
| 10480 |
|
| 10481 |
+
GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
| 10482 |
return GGML_CUDA_NAME "_Split";
|
| 10483 |
|
| 10484 |
UNUSED(buffer);
|
|
|
|
| 10489 |
// return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
|
| 10490 |
//}
|
| 10491 |
|
| 10492 |
+
GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 10493 |
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
| 10494 |
delete ctx;
|
| 10495 |
}
|
| 10496 |
|
| 10497 |
+
GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
| 10498 |
// the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
|
| 10499 |
return (void *)0x1000;
|
| 10500 |
|
| 10501 |
UNUSED(buffer);
|
| 10502 |
}
|
| 10503 |
|
| 10504 |
+
GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
| 10505 |
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
| 10506 |
|
| 10507 |
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
|
|
|
| 10551 |
tensor->extra = extra;
|
| 10552 |
}
|
| 10553 |
|
| 10554 |
+
GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 10555 |
// split tensors must always be set in their entirety at once
|
| 10556 |
GGML_ASSERT(offset == 0);
|
| 10557 |
GGML_ASSERT(size == ggml_nbytes(tensor));
|
|
|
|
| 10585 |
}
|
| 10586 |
}
|
| 10587 |
|
| 10588 |
+
GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 10589 |
// split tensors must always be set in their entirety at once
|
| 10590 |
GGML_ASSERT(offset == 0);
|
| 10591 |
GGML_ASSERT(size == ggml_nbytes(tensor));
|
|
|
|
| 10619 |
}
|
| 10620 |
}
|
| 10621 |
|
| 10622 |
+
GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
| 10623 |
UNUSED(buffer);
|
| 10624 |
UNUSED(value);
|
| 10625 |
}
|
|
|
|
| 10638 |
|
| 10639 |
// cuda split buffer type
|
| 10640 |
|
| 10641 |
+
GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
| 10642 |
return GGML_CUDA_NAME "_Split";
|
| 10643 |
|
| 10644 |
UNUSED(buft);
|
| 10645 |
}
|
| 10646 |
|
| 10647 |
+
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 10648 |
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
| 10649 |
// instead, we allocate them for each tensor separately in init_tensor
|
| 10650 |
// however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
|
|
|
|
| 10654 |
return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
|
| 10655 |
}
|
| 10656 |
|
| 10657 |
+
GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
| 10658 |
return 128;
|
| 10659 |
|
| 10660 |
UNUSED(buft);
|
| 10661 |
}
|
| 10662 |
|
| 10663 |
+
GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
| 10664 |
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
|
| 10665 |
|
| 10666 |
size_t total_size = 0;
|
|
|
|
| 10687 |
return total_size;
|
| 10688 |
}
|
| 10689 |
|
| 10690 |
+
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
| 10691 |
return ggml_backend_is_cuda(backend);
|
| 10692 |
|
| 10693 |
UNUSED(buft);
|
| 10694 |
}
|
| 10695 |
|
| 10696 |
+
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
| 10697 |
return false;
|
| 10698 |
|
| 10699 |
UNUSED(buft);
|
|
|
|
| 10708 |
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
| 10709 |
};
|
| 10710 |
|
| 10711 |
+
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
|
| 10712 |
// FIXME: this is not thread safe
|
| 10713 |
static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
|
| 10714 |
|
|
|
|
| 10744 |
|
| 10745 |
// host buffer type
|
| 10746 |
|
| 10747 |
+
GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
| 10748 |
return GGML_CUDA_NAME "_Host";
|
| 10749 |
|
| 10750 |
UNUSED(buft);
|
| 10751 |
}
|
| 10752 |
|
| 10753 |
+
GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
|
| 10754 |
return GGML_CUDA_NAME "_Host";
|
| 10755 |
|
| 10756 |
UNUSED(buffer);
|
| 10757 |
}
|
| 10758 |
|
| 10759 |
+
GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 10760 |
ggml_cuda_host_free(buffer->context);
|
| 10761 |
}
|
| 10762 |
|
| 10763 |
+
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 10764 |
void * ptr = ggml_cuda_host_malloc(size);
|
| 10765 |
|
| 10766 |
if (ptr == nullptr) {
|
|
|
|
| 10776 |
return buffer;
|
| 10777 |
}
|
| 10778 |
|
| 10779 |
+
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
| 10780 |
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
|
| 10781 |
/* .iface = */ {
|
| 10782 |
/* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
|
|
|
|
| 10794 |
|
| 10795 |
// backend
|
| 10796 |
|
| 10797 |
+
GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
| 10798 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 10799 |
|
| 10800 |
return cuda_ctx->name.c_str();
|
| 10801 |
}
|
| 10802 |
|
| 10803 |
+
GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
| 10804 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 10805 |
|
| 10806 |
delete cuda_ctx;
|
| 10807 |
delete backend;
|
| 10808 |
}
|
| 10809 |
|
| 10810 |
+
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
|
| 10811 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 10812 |
|
| 10813 |
return ggml_backend_cuda_buffer_type(cuda_ctx->device);
|
| 10814 |
}
|
| 10815 |
|
| 10816 |
+
GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 10817 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 10818 |
|
| 10819 |
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
|
|
|
| 10822 |
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
|
| 10823 |
}
|
| 10824 |
|
| 10825 |
+
GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 10826 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 10827 |
|
| 10828 |
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
|
|
|
| 10831 |
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
|
| 10832 |
}
|
| 10833 |
|
| 10834 |
+
GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
|
| 10835 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 10836 |
|
| 10837 |
if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
|
|
|
|
| 10842 |
return false;
|
| 10843 |
}
|
| 10844 |
|
| 10845 |
+
GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
| 10846 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 10847 |
|
| 10848 |
CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
|
|
|
|
| 10850 |
UNUSED(backend);
|
| 10851 |
}
|
| 10852 |
|
| 10853 |
+
GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
| 10854 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 10855 |
|
| 10856 |
ggml_cuda_set_main_device(cuda_ctx->device);
|
|
|
|
| 10889 |
return true;
|
| 10890 |
}
|
| 10891 |
|
| 10892 |
+
GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
| 10893 |
switch (op->op) {
|
| 10894 |
case GGML_OP_UNARY:
|
| 10895 |
switch (ggml_get_unary_op(op)) {
|
|
|
|
| 11015 |
/* .supports_op = */ ggml_backend_cuda_supports_op,
|
| 11016 |
};
|
| 11017 |
|
| 11018 |
+
GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
|
| 11019 |
ggml_init_cublas(); // TODO: remove from ggml.c
|
| 11020 |
|
| 11021 |
if (device < 0 || device >= ggml_cuda_get_device_count()) {
|
|
|
|
| 11039 |
return cuda_backend;
|
| 11040 |
}
|
| 11041 |
|
| 11042 |
+
GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
|
| 11043 |
return backend && backend->iface.get_name == ggml_backend_cuda_name;
|
| 11044 |
}
|
| 11045 |
|
| 11046 |
+
GGML_CALL int ggml_backend_cuda_get_device_count() {
|
| 11047 |
return ggml_cuda_get_device_count();
|
| 11048 |
}
|
| 11049 |
|
| 11050 |
+
GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
|
| 11051 |
ggml_cuda_get_device_description(device, description, description_size);
|
| 11052 |
}
|
| 11053 |
|
| 11054 |
+
GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
|
| 11055 |
ggml_cuda_set_device(device);
|
| 11056 |
|
| 11057 |
CUDA_CHECK(cudaMemGetInfo(free, total));
|
| 11058 |
}
|
| 11059 |
|
| 11060 |
// backend registry
|
| 11061 |
+
GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
|
| 11062 |
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
|
| 11063 |
return cuda_backend;
|
| 11064 |
|
| 11065 |
UNUSED(params);
|
| 11066 |
}
|
| 11067 |
|
| 11068 |
+
extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
|
| 11069 |
|
| 11070 |
+
GGML_CALL int ggml_backend_cuda_reg_devices() {
|
| 11071 |
int device_count = ggml_cuda_get_device_count();
|
| 11072 |
//int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
|
| 11073 |
for (int i = 0; i < device_count; i++) {
|
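Before the header changes below, note the pattern the ggml-cuda.cu hunks above apply uniformly: GGML_CALL sits between the storage class (or GGML_API) and the return type, and it appears on both the declaration and the definition of every function that crosses the application/GPU-module boundary. A minimal sketch with a hypothetical function (not part of this diff):

// header (hypothetical example, mirroring the ggml-cuda.h hunks below)
GGML_API GGML_CALL int ggml_backend_example_get_device_count(void);

// source file
GGML_CALL int ggml_backend_example_get_device_count(void) {
    return 1; // trivially report a single device
}

// static callbacks stored in an interface struct follow the same pattern
GGML_CALL static size_t ggml_backend_example_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    return 128; // same alignment the CUDA buffer types above report

    UNUSED(buft);
}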
ggml-cuda.h
CHANGED
|
@@ -18,34 +18,34 @@ extern "C" {
|
|
| 18 |
#define GGML_CUDA_MAX_DEVICES 16
|
| 19 |
|
| 20 |
// Always succeeds. To check whether CUDA is actually loaded, use `ggml_cublas_loaded`.
|
| 21 |
-
GGML_API void ggml_init_cublas(void);
|
| 22 |
|
| 23 |
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
|
| 24 |
-
GGML_API bool ggml_cublas_loaded(void);
|
| 25 |
|
| 26 |
-
GGML_API void * ggml_cuda_host_malloc(size_t size);
|
| 27 |
-
GGML_API void ggml_cuda_host_free(void * ptr);
|
| 28 |
|
| 29 |
-
GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
| 30 |
-
GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
| 31 |
|
| 32 |
-
GGML_API int ggml_cuda_get_device_count(void);
|
| 33 |
-
GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
|
| 34 |
|
| 35 |
// backend API
|
| 36 |
-
GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
|
| 37 |
|
| 38 |
-
GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
|
| 39 |
|
| 40 |
-
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
| 41 |
// split tensor buffer that splits matrices by rows across multiple devices
|
| 42 |
-
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
|
| 43 |
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
| 44 |
-
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
| 45 |
|
| 46 |
-
GGML_API int ggml_backend_cuda_get_device_count(void);
|
| 47 |
-
GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
|
| 48 |
-
GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
|
| 49 |
|
| 50 |
#ifdef __cplusplus
|
| 51 |
}
|
|
|
|
| 18 |
#define GGML_CUDA_MAX_DEVICES 16
|
| 19 |
|
| 20 |
// Always succeeds. To check whether CUDA is actually loaded, use `ggml_cublas_loaded`.
|
| 21 |
+
GGML_API GGML_CALL void ggml_init_cublas(void);
|
| 22 |
|
| 23 |
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
|
| 24 |
+
GGML_API GGML_CALL bool ggml_cublas_loaded(void);
|
| 25 |
|
| 26 |
+
GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
|
| 27 |
+
GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);
|
| 28 |
|
| 29 |
+
GGML_API GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
| 30 |
+
GGML_API GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
| 31 |
|
| 32 |
+
GGML_API GGML_CALL int ggml_cuda_get_device_count(void);
|
| 33 |
+
GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
|
| 34 |
|
| 35 |
// backend API
|
| 36 |
+
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
|
| 37 |
|
| 38 |
+
GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
|
| 39 |
|
| 40 |
+
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
| 41 |
// split tensor buffer that splits matrices by rows across multiple devices
|
| 42 |
+
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
|
| 43 |
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
| 44 |
+
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
| 45 |
|
| 46 |
+
GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
|
| 47 |
+
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
|
| 48 |
+
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
|
| 49 |
|
| 50 |
#ifdef __cplusplus
|
| 51 |
}
|
ggml-metal.h
CHANGED
|
@@ -47,11 +47,11 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
|
|
| 47 |
|
| 48 |
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
| 49 |
|
| 50 |
-
GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
|
| 51 |
|
| 52 |
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
|
| 53 |
|
| 54 |
-
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
| 55 |
|
| 56 |
// helper to check if the device supports a specific family
|
| 57 |
// ideally, the user code should be doing these checks
|
|
|
|
| 47 |
|
| 48 |
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
| 49 |
|
| 50 |
+
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
|
| 51 |
|
| 52 |
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
|
| 53 |
|
| 54 |
+
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
| 55 |
|
| 56 |
// helper to check if the device supports a specific family
|
| 57 |
// ideally, the user code should be doing these checks
|
ggml-metal.m
CHANGED
|
@@ -2294,13 +2294,13 @@ static void ggml_backend_metal_free_device(void) {
|
|
| 2294 |
}
|
| 2295 |
}
|
| 2296 |
|
| 2297 |
-
static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
|
| 2298 |
return "Metal";
|
| 2299 |
|
| 2300 |
UNUSED(buffer);
|
| 2301 |
}
|
| 2302 |
|
| 2303 |
-
static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 2304 |
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
| 2305 |
|
| 2306 |
for (int i = 0; i < ctx->n_buffers; i++) {
|
|
@@ -2315,25 +2315,25 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
|
|
| 2315 |
free(ctx);
|
| 2316 |
}
|
| 2317 |
|
| 2318 |
-
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
|
| 2319 |
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
| 2320 |
|
| 2321 |
return ctx->all_data;
|
| 2322 |
}
|
| 2323 |
|
| 2324 |
-
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 2325 |
memcpy((char *)tensor->data + offset, data, size);
|
| 2326 |
|
| 2327 |
UNUSED(buffer);
|
| 2328 |
}
|
| 2329 |
|
| 2330 |
-
static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 2331 |
memcpy(data, (const char *)tensor->data + offset, size);
|
| 2332 |
|
| 2333 |
UNUSED(buffer);
|
| 2334 |
}
|
| 2335 |
|
| 2336 |
-
static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
| 2337 |
if (ggml_backend_buffer_is_host(src->buffer)) {
|
| 2338 |
memcpy(dst->data, src->data, ggml_nbytes(src));
|
| 2339 |
return true;
|
|
@@ -2343,7 +2343,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
|
|
| 2343 |
UNUSED(buffer);
|
| 2344 |
}
|
| 2345 |
|
| 2346 |
-
static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
| 2347 |
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
| 2348 |
|
| 2349 |
memset(ctx->all_data, value, ctx->all_size);
|
|
@@ -2363,13 +2363,13 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
|
|
| 2363 |
|
| 2364 |
// default buffer type
|
| 2365 |
|
| 2366 |
-
static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
| 2367 |
return "Metal";
|
| 2368 |
|
| 2369 |
UNUSED(buft);
|
| 2370 |
}
|
| 2371 |
|
| 2372 |
-
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 2373 |
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
| 2374 |
|
| 2375 |
const size_t size_page = sysconf(_SC_PAGESIZE);
|
|
@@ -2421,24 +2421,24 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
|
|
| 2421 |
return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
|
| 2422 |
}
|
| 2423 |
|
| 2424 |
-
static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
| 2425 |
return 32;
|
| 2426 |
UNUSED(buft);
|
| 2427 |
}
|
| 2428 |
|
| 2429 |
-
static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
| 2430 |
return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
|
| 2431 |
|
| 2432 |
UNUSED(buft);
|
| 2433 |
}
|
| 2434 |
|
| 2435 |
-
static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
| 2436 |
return true;
|
| 2437 |
|
| 2438 |
UNUSED(buft);
|
| 2439 |
}
|
| 2440 |
|
| 2441 |
-
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
| 2442 |
static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
|
| 2443 |
/* .iface = */ {
|
| 2444 |
/* .get_name = */ ggml_backend_metal_buffer_type_get_name,
|
|
@@ -2456,7 +2456,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
|
| 2456 |
|
| 2457 |
// buffer from ptr
|
| 2458 |
|
| 2459 |
-
ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
|
| 2460 |
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
| 2461 |
|
| 2462 |
ctx->all_data = data;
|
|
@@ -2543,31 +2543,31 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
|
|
| 2543 |
|
| 2544 |
// backend
|
| 2545 |
|
| 2546 |
-
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
|
| 2547 |
return "Metal";
|
| 2548 |
|
| 2549 |
UNUSED(backend);
|
| 2550 |
}
|
| 2551 |
|
| 2552 |
-
static void ggml_backend_metal_free(ggml_backend_t backend) {
|
| 2553 |
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
| 2554 |
ggml_metal_free(ctx);
|
| 2555 |
free(backend);
|
| 2556 |
}
|
| 2557 |
|
| 2558 |
-
static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
|
| 2559 |
return ggml_backend_metal_buffer_type();
|
| 2560 |
|
| 2561 |
UNUSED(backend);
|
| 2562 |
}
|
| 2563 |
|
| 2564 |
-
static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 2565 |
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
| 2566 |
|
| 2567 |
return ggml_metal_graph_compute(metal_ctx, cgraph);
|
| 2568 |
}
|
| 2569 |
|
| 2570 |
-
static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
| 2571 |
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
| 2572 |
|
| 2573 |
return ggml_metal_supports_op(metal_ctx, op);
|
|
@@ -2630,9 +2630,9 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
|
|
| 2630 |
return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
|
| 2631 |
}
|
| 2632 |
|
| 2633 |
-
ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
|
| 2634 |
|
| 2635 |
-
ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
|
| 2636 |
return ggml_backend_metal_init();
|
| 2637 |
|
| 2638 |
GGML_UNUSED(params);
|
|
|
|
| 2294 |
}
|
| 2295 |
}
|
| 2296 |
|
| 2297 |
+
GGML_CALL static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
|
| 2298 |
return "Metal";
|
| 2299 |
|
| 2300 |
UNUSED(buffer);
|
| 2301 |
}
|
| 2302 |
|
| 2303 |
+
GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 2304 |
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
| 2305 |
|
| 2306 |
for (int i = 0; i < ctx->n_buffers; i++) {
|
|
|
|
| 2315 |
free(ctx);
|
| 2316 |
}
|
| 2317 |
|
| 2318 |
+
GGML_CALL static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
|
| 2319 |
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
| 2320 |
|
| 2321 |
return ctx->all_data;
|
| 2322 |
}
|
| 2323 |
|
| 2324 |
+
GGML_CALL static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 2325 |
memcpy((char *)tensor->data + offset, data, size);
|
| 2326 |
|
| 2327 |
UNUSED(buffer);
|
| 2328 |
}
|
| 2329 |
|
| 2330 |
+
GGML_CALL static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 2331 |
memcpy(data, (const char *)tensor->data + offset, size);
|
| 2332 |
|
| 2333 |
UNUSED(buffer);
|
| 2334 |
}
|
| 2335 |
|
| 2336 |
+
GGML_CALL static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
| 2337 |
if (ggml_backend_buffer_is_host(src->buffer)) {
|
| 2338 |
memcpy(dst->data, src->data, ggml_nbytes(src));
|
| 2339 |
return true;
|
|
|
|
| 2343 |
UNUSED(buffer);
|
| 2344 |
}
|
| 2345 |
|
| 2346 |
+
GGML_CALL static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
| 2347 |
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
| 2348 |
|
| 2349 |
memset(ctx->all_data, value, ctx->all_size);
|
|
|
|
| 2363 |
|
| 2364 |
// default buffer type
|
| 2365 |
|
| 2366 |
+
GGML_CALL static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
| 2367 |
return "Metal";
|
| 2368 |
|
| 2369 |
UNUSED(buft);
|
| 2370 |
}
|
| 2371 |
|
| 2372 |
+
GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 2373 |
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
| 2374 |
|
| 2375 |
const size_t size_page = sysconf(_SC_PAGESIZE);
|
|
|
|
| 2421 |
return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
|
| 2422 |
}
|
| 2423 |
|
| 2424 |
+
GGML_CALL static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
| 2425 |
return 32;
|
| 2426 |
UNUSED(buft);
|
| 2427 |
}
|
| 2428 |
|
| 2429 |
+
GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
| 2430 |
return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
|
| 2431 |
|
| 2432 |
UNUSED(buft);
|
| 2433 |
}
|
| 2434 |
|
| 2435 |
+
GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
| 2436 |
return true;
|
| 2437 |
|
| 2438 |
UNUSED(buft);
|
| 2439 |
}
|
| 2440 |
|
| 2441 |
+
GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
| 2442 |
static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
|
| 2443 |
/* .iface = */ {
|
| 2444 |
/* .get_name = */ ggml_backend_metal_buffer_type_get_name,
|
|
|
|
| 2456 |
|
| 2457 |
// buffer from ptr
|
| 2458 |
|
| 2459 |
+
GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
|
| 2460 |
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
| 2461 |
|
| 2462 |
ctx->all_data = data;
|
|
|
|
| 2543 |
|
| 2544 |
// backend
|
| 2545 |
|
| 2546 |
+
GGML_CALL static const char * ggml_backend_metal_name(ggml_backend_t backend) {
|
| 2547 |
return "Metal";
|
| 2548 |
|
| 2549 |
UNUSED(backend);
|
| 2550 |
}
|
| 2551 |
|
| 2552 |
+
GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) {
|
| 2553 |
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
| 2554 |
ggml_metal_free(ctx);
|
| 2555 |
free(backend);
|
| 2556 |
}
|
| 2557 |
|
| 2558 |
+
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
|
| 2559 |
return ggml_backend_metal_buffer_type();
|
| 2560 |
|
| 2561 |
UNUSED(backend);
|
| 2562 |
}
|
| 2563 |
|
| 2564 |
+
GGML_CALL static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 2565 |
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
| 2566 |
|
| 2567 |
return ggml_metal_graph_compute(metal_ctx, cgraph);
|
| 2568 |
}
|
| 2569 |
|
| 2570 |
+
GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
| 2571 |
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
| 2572 |
|
| 2573 |
return ggml_metal_supports_op(metal_ctx, op);
|
|
|
|
| 2630 |
return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
|
| 2631 |
}
|
| 2632 |
|
| 2633 |
+
GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
|
| 2634 |
|
| 2635 |
+
GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
|
| 2636 |
return ggml_backend_metal_init();
|
| 2637 |
|
| 2638 |
GGML_UNUSED(params);
|
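One detail worth calling out from the Metal hunks above: where a prototype exists only to silence a compiler warning (as with ggml_backend_reg_metal_init), the annotation is repeated on both the prototype and the definition, so the two declarations describe the same calling convention once GGML_CALL expands to an attribute. A hedged sketch with hypothetical names:

// forward declaration only to silence the missing-prototype warning (hypothetical backend)
GGML_CALL ggml_backend_t ggml_backend_reg_example_init(const char * params, void * user_data);

GGML_CALL ggml_backend_t ggml_backend_reg_example_init(const char * params, void * user_data) {
    // a real registry hook would construct and return its backend here
    (void) params;
    (void) user_data;
    return NULL;
}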
ggml.c
CHANGED
|
@@ -1990,19 +1990,19 @@ void ggml_print_objects(const struct ggml_context * ctx) {
|
|
| 1990 |
GGML_PRINT("%s: --- end ---\n", __func__);
|
| 1991 |
}
|
| 1992 |
|
| 1993 |
-
int64_t ggml_nelements(const struct ggml_tensor * tensor) {
|
| 1994 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 1995 |
|
| 1996 |
return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
| 1997 |
}
|
| 1998 |
|
| 1999 |
-
int64_t ggml_nrows(const struct ggml_tensor * tensor) {
|
| 2000 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 2001 |
|
| 2002 |
return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
| 2003 |
}
|
| 2004 |
|
| 2005 |
-
size_t ggml_nbytes(const struct ggml_tensor * tensor) {
|
| 2006 |
size_t nbytes;
|
| 2007 |
size_t blck_size = ggml_blck_size(tensor->type);
|
| 2008 |
if (blck_size == 1) {
|
|
@@ -2025,15 +2025,15 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
|
|
| 2025 |
return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
|
| 2026 |
}
|
| 2027 |
|
| 2028 |
-
int ggml_blck_size(enum ggml_type type) {
|
| 2029 |
return type_traits[type].blck_size;
|
| 2030 |
}
|
| 2031 |
|
| 2032 |
-
size_t ggml_type_size(enum ggml_type type) {
|
| 2033 |
return type_traits[type].type_size;
|
| 2034 |
}
|
| 2035 |
|
| 2036 |
-
size_t ggml_row_size(enum ggml_type type, int64_t ne) {
|
| 2037 |
assert(ne % ggml_blck_size(type) == 0);
|
| 2038 |
return ggml_type_size(type)*ne/ggml_blck_size(type);
|
| 2039 |
}
|
|
@@ -2042,15 +2042,15 @@ double ggml_type_sizef(enum ggml_type type) {
|
|
| 2042 |
return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
|
| 2043 |
}
|
| 2044 |
|
| 2045 |
-
const char * ggml_type_name(enum ggml_type type) {
|
| 2046 |
return type_traits[type].type_name;
|
| 2047 |
}
|
| 2048 |
|
| 2049 |
-
bool ggml_is_quantized(enum ggml_type type) {
|
| 2050 |
return type_traits[type].is_quantized;
|
| 2051 |
}
|
| 2052 |
|
| 2053 |
-
const char * ggml_op_name(enum ggml_op op) {
|
| 2054 |
return GGML_OP_NAME[op];
|
| 2055 |
}
|
| 2056 |
|
|
@@ -2062,7 +2062,7 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) {
|
|
| 2062 |
return GGML_UNARY_OP_NAME[op];
|
| 2063 |
}
|
| 2064 |
|
| 2065 |
-
const char * ggml_op_desc(const struct ggml_tensor * t) {
|
| 2066 |
if (t->op == GGML_OP_UNARY) {
|
| 2067 |
enum ggml_unary_op uop = ggml_get_unary_op(t);
|
| 2068 |
return ggml_unary_op_name(uop);
|
|
@@ -2072,7 +2072,7 @@ const char * ggml_op_desc(const struct ggml_tensor * t) {
|
|
| 2072 |
}
|
| 2073 |
}
|
| 2074 |
|
| 2075 |
-
size_t ggml_element_size(const struct ggml_tensor * tensor) {
|
| 2076 |
return ggml_type_size(tensor->type);
|
| 2077 |
}
|
| 2078 |
|
|
@@ -2154,11 +2154,11 @@ size_t ggml_tensor_overhead(void) {
|
|
| 2154 |
return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
|
| 2155 |
}
|
| 2156 |
|
| 2157 |
-
bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
| 2158 |
return tensor->nb[0] > tensor->nb[1];
|
| 2159 |
}
|
| 2160 |
|
| 2161 |
-
bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
| 2162 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 2163 |
|
| 2164 |
return
|
|
@@ -2177,7 +2177,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
|
|
| 2177 |
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
| 2178 |
}
|
| 2179 |
|
| 2180 |
-
bool ggml_is_permuted(const struct ggml_tensor * tensor) {
|
| 2181 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 2182 |
|
| 2183 |
return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
|
|
@@ -3079,7 +3079,7 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
|
|
| 3079 |
return (float *)(tensor->data);
|
| 3080 |
}
|
| 3081 |
|
| 3082 |
-
enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
|
| 3083 |
GGML_ASSERT(tensor->op == GGML_OP_UNARY);
|
| 3084 |
return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
|
| 3085 |
}
|
|
@@ -11653,7 +11653,7 @@ static void ggml_rope_cache_init(
|
|
| 11653 |
}
|
| 11654 |
}
|
| 11655 |
|
| 11656 |
-
void ggml_rope_yarn_corr_dims(
|
| 11657 |
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
| 11658 |
) {
|
| 11659 |
// start and end correction dims
|
|
|
|
| 1990 |
GGML_PRINT("%s: --- end ---\n", __func__);
|
| 1991 |
}
|
| 1992 |
|
| 1993 |
+
GGML_CALL int64_t ggml_nelements(const struct ggml_tensor * tensor) {
|
| 1994 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 1995 |
|
| 1996 |
return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
| 1997 |
}
|
| 1998 |
|
| 1999 |
+
GGML_CALL int64_t ggml_nrows(const struct ggml_tensor * tensor) {
|
| 2000 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 2001 |
|
| 2002 |
return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
| 2003 |
}
|
| 2004 |
|
| 2005 |
+
GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) {
|
| 2006 |
size_t nbytes;
|
| 2007 |
size_t blck_size = ggml_blck_size(tensor->type);
|
| 2008 |
if (blck_size == 1) {
|
|
|
|
| 2025 |
return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
|
| 2026 |
}
|
| 2027 |
|
| 2028 |
+
GGML_CALL int ggml_blck_size(enum ggml_type type) {
|
| 2029 |
return type_traits[type].blck_size;
|
| 2030 |
}
|
| 2031 |
|
| 2032 |
+
GGML_CALL size_t ggml_type_size(enum ggml_type type) {
|
| 2033 |
return type_traits[type].type_size;
|
| 2034 |
}
|
| 2035 |
|
| 2036 |
+
GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) {
|
| 2037 |
assert(ne % ggml_blck_size(type) == 0);
|
| 2038 |
return ggml_type_size(type)*ne/ggml_blck_size(type);
|
| 2039 |
}
|
|
|
|
| 2042 |
return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
|
| 2043 |
}
|
| 2044 |
|
| 2045 |
+
GGML_CALL const char * ggml_type_name(enum ggml_type type) {
|
| 2046 |
return type_traits[type].type_name;
|
| 2047 |
}
|
| 2048 |
|
| 2049 |
+
GGML_CALL bool ggml_is_quantized(enum ggml_type type) {
|
| 2050 |
return type_traits[type].is_quantized;
|
| 2051 |
}
|
| 2052 |
|
| 2053 |
+
GGML_CALL const char * ggml_op_name(enum ggml_op op) {
|
| 2054 |
return GGML_OP_NAME[op];
|
| 2055 |
}
|
| 2056 |
|
|
|
|
| 2062 |
return GGML_UNARY_OP_NAME[op];
|
| 2063 |
}
|
| 2064 |
|
| 2065 |
+
GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) {
|
| 2066 |
if (t->op == GGML_OP_UNARY) {
|
| 2067 |
enum ggml_unary_op uop = ggml_get_unary_op(t);
|
| 2068 |
return ggml_unary_op_name(uop);
|
|
|
|
| 2072 |
}
|
| 2073 |
}
|
| 2074 |
|
| 2075 |
+
GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) {
|
| 2076 |
return ggml_type_size(tensor->type);
|
| 2077 |
}
|
| 2078 |
|
|
|
|
| 2154 |
return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
|
| 2155 |
}
|
| 2156 |
|
| 2157 |
+
GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
| 2158 |
return tensor->nb[0] > tensor->nb[1];
|
| 2159 |
}
|
| 2160 |
|
| 2161 |
+
GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
| 2162 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 2163 |
|
| 2164 |
return
|
|
|
|
| 2177 |
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
| 2178 |
}
|
| 2179 |
|
| 2180 |
+
GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
|
| 2181 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 2182 |
|
| 2183 |
return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
|
|
|
|
| 3079 |
return (float *)(tensor->data);
|
| 3080 |
}
|
| 3081 |
|
| 3082 |
+
GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
|
| 3083 |
GGML_ASSERT(tensor->op == GGML_OP_UNARY);
|
| 3084 |
return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
|
| 3085 |
}
|
|
|
|
| 11653 |
}
|
| 11654 |
}
|
| 11655 |
|
| 11656 |
+
GGML_CALL void ggml_rope_yarn_corr_dims(
|
| 11657 |
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
| 11658 |
) {
|
| 11659 |
// start and end correction dims
|
ggml.h
CHANGED
|
@@ -187,6 +187,16 @@
|
|
| 187 |
# define GGML_API
|
| 188 |
#endif
|
| 189 |
|
| 190 |
// TODO: support for clang
|
| 191 |
#ifdef __GNUC__
|
| 192 |
# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
|
|
@@ -649,41 +659,41 @@ extern "C" {
|
|
| 649 |
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
| 650 |
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
| 651 |
|
| 652 |
-
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
|
| 653 |
-
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
| 654 |
-
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
| 655 |
-
GGML_API size_t  ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
| 656 |
|
| 657 |
-
GGML_API int ggml_blck_size(enum ggml_type type);
|
| 658 |
-
GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
| 659 |
-
GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
| 660 |
|
| 661 |
GGML_DEPRECATED(
|
| 662 |
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
|
| 663 |
"use ggml_row_size() instead");
|
| 664 |
|
| 665 |
-
GGML_API const char * ggml_type_name(enum ggml_type type);
|
| 666 |
-
GGML_API const char * ggml_op_name (enum ggml_op op);
|
| 667 |
-
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
| 668 |
|
| 669 |
-
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
|
| 670 |
-
GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
|
| 671 |
|
| 672 |
-
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
| 673 |
|
| 674 |
-
GGML_API bool ggml_is_quantized(enum ggml_type type);
|
| 675 |
|
| 676 |
// TODO: temporary until model loading of ggml examples is refactored
|
| 677 |
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
| 678 |
|
| 679 |
-
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
| 680 |
-
GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
| 681 |
-
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
| 682 |
-
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
| 683 |
-
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
|
| 684 |
-
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
|
| 685 |
-
GGML_API bool ggml_is_3d     (const struct ggml_tensor * tensor);
|
| 686 |
-
GGML_API int  ggml_n_dims    (const struct ggml_tensor * tensor); // returns 1 for scalars
|
| 687 |
|
| 688 |
GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
| 689 |
|
|
@@ -770,7 +780,7 @@ extern "C" {
|
|
| 770 |
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
| 771 |
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
| 772 |
|
| 773 |
-
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
|
| 774 |
|
| 775 |
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
|
| 776 |
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
|
|
@@ -1413,7 +1423,7 @@ extern "C" {
|
|
| 1413 |
float beta_slow);
|
| 1414 |
|
| 1415 |
// compute correction dims for YaRN RoPE scaling
|
| 1416 |
-
void ggml_rope_yarn_corr_dims(
|
| 1417 |
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
| 1418 |
|
| 1419 |
// xPos RoPE, in-place, returns view(a)
|
|
|
|
| 187 |
# define GGML_API
|
| 188 |
#endif
|
| 189 |
|
| 190 |
+
#ifdef GGML_MULTIPLATFORM
|
| 191 |
+
# if defined(_WIN32)
|
| 192 |
+
# define GGML_CALL
|
| 193 |
+
# else
|
| 194 |
+
# define GGML_CALL __attribute__((__ms_abi__))
|
| 195 |
+
# endif
|
| 196 |
+
#else
|
| 197 |
+
# define GGML_CALL
|
| 198 |
+
#endif
|
| 199 |
+
|
| 200 |
// TODO: support for clang
|
| 201 |
#ifdef __GNUC__
|
| 202 |
# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
|
|
|
|
| 659 |
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
| 660 |
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
| 661 |
|
| 662 |
+
GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor);
|
| 663 |
+
GGML_API GGML_CALL int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
| 664 |
+
GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
| 665 |
+
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
| 666 |
|
| 667 |
+
GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type);
|
| 668 |
+
GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
| 669 |
+
GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
| 670 |
|
| 671 |
GGML_DEPRECATED(
|
| 672 |
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
|
| 673 |
"use ggml_row_size() instead");
|
| 674 |
|
| 675 |
+
GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
|
| 676 |
+
GGML_API GGML_CALL const char * ggml_op_name (enum ggml_op op);
|
| 677 |
+
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
| 678 |
|
| 679 |
+
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
|
| 680 |
+
GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
|
| 681 |
|
| 682 |
+
GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor);
|
| 683 |
|
| 684 |
+
GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type);
|
| 685 |
|
| 686 |
// TODO: temporary until model loading of ggml examples is refactored
|
| 687 |
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
| 688 |
|
| 689 |
+
GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
| 690 |
+
GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
| 691 |
+
GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
| 692 |
+
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
| 693 |
+
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
|
| 694 |
+
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
|
| 695 |
+
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
| 696 |
+
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
| 697 |
|
| 698 |
GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
| 699 |
|
|
|
|
| 780 |
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
| 781 |
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
| 782 |
|
| 783 |
+
GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
|
| 784 |
|
| 785 |
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
|
| 786 |
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
|
|
|
|
| 1423 |
float beta_slow);
|
| 1424 |
|
| 1425 |
// compute correction dims for YaRN RoPE scaling
|
| 1426 |
+
GGML_CALL void ggml_rope_yarn_corr_dims(
|
| 1427 |
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
| 1428 |
|
| 1429 |
// xPos RoPE, in-place, returns view(a)
|
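The ggml.h hunk above is the only place where GGML_CALL is defined; every other hunk in this change just applies it. As the #ifdef block shows, the macro expands to nothing by default, and to nothing on _WIN32 even with -DGGML_MULTIPLATFORM (MS ABI is already the native convention there); only non-Windows multiplatform builds get __attribute__((__ms_abi__)). A hedged illustration of what one of the annotated declarations becomes after preprocessing on such a build, assuming GGML_API itself expands to nothing in that configuration:

// ggml.h, as written in the hunk above:
GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block

// after preprocessing with -DGGML_MULTIPLATFORM on a non-Windows target:
__attribute__((__ms_abi__)) size_t ggml_type_size(enum ggml_type type);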