sync : ggml (ggml_scale, ggml_row_size, etc.) (#1677)
* sync : ggml
* sync : llama.cpp
* talk-llama : fix obsolete param
* ggml-alloc : fix ggml_tallocr_is_own
* talk.wasm : update to new ggml
* ggml : fix type punning in ggml_scale
* ggml : cuda jetson + arm quants warnings
- examples/talk-llama/llama.cpp +0 -0
- examples/talk-llama/llama.h +114 -12
- examples/talk-llama/talk-llama.cpp +0 -1
- examples/talk.wasm/gpt-2.cpp +20 -21
- examples/talk/gpt-2.cpp +20 -21
- extra/sync-llama.sh +5 -0
- ggml-alloc.c +13 -5
- ggml-backend-impl.h +12 -8
- ggml-backend.c +75 -5
- ggml-backend.h +7 -0
- ggml-cuda.cu +303 -169
- ggml-metal.h +3 -0
- ggml-metal.m +190 -44
- ggml-metal.metal +11 -2
- ggml-quants.c +2 -2
- ggml.c +326 -150
- ggml.h +37 -16
- whisper.cpp +12 -30
examples/talk-llama/llama.cpp
CHANGED

The diff for this file is too large to render. See raw diff.

examples/talk-llama/llama.h
CHANGED

@@ -39,10 +39,11 @@

 #define LLAMA_MAX_RNG_STATE (64*1024)

+#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION
+#define LLAMA_SESSION_VERSION 3

 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.

@@ -126,7 +127,7 @@ extern "C" {
 bool sorted;
 } llama_token_data_array;

-typedef
+typedef bool (*llama_progress_callback)(float progress, void *ctx);

 // Input data for llama_decode
 // A llama_batch object can contain input about one or many sequences

@@ -158,16 +159,38 @@ extern "C" {
 llama_seq_id all_seq_id; // used if seq_id == NULL
 } llama_batch;

+enum llama_model_kv_override_type {
+    LLAMA_KV_OVERRIDE_INT,
+    LLAMA_KV_OVERRIDE_FLOAT,
+    LLAMA_KV_OVERRIDE_BOOL,
+};
+
+struct llama_model_kv_override {
+    char key[128];
+    enum llama_model_kv_override_type tag;
+    union {
+        int64_t int_value;
+        double float_value;
+        bool bool_value;
+    };
+};
+
 struct llama_model_params {
 int32_t n_gpu_layers; // number of layers to store in VRAM
 int32_t main_gpu;     // the GPU that is used for scratch and small tensors
 const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)

-//
+// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+// If the provided progress_callback returns true, model loading continues.
+// If it returns false, model loading is immediately aborted.
 llama_progress_callback progress_callback;
+
 // context pointer passed to the progress callback
 void * progress_callback_user_data;

+// override key-value pairs of the model meta data
+const struct llama_model_kv_override * kv_overrides;
+
 // Keep the booleans together to avoid misalignment during copy-by-value.
 bool vocab_only; // only load the vocabulary, no weights
 bool use_mmap;   // use mmap if possible

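The two additions above (the llama_model_kv_override array and the documented progress-callback contract) are used together when loading a model. A minimal sketch in C: llama_model_default_params(), llama_load_model_from_file() and llama_free_model() are assumed from the rest of this header, the override key is made up, and the empty-key terminator follows the llama.cpp convention of the time rather than anything shown in this hunk.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include "llama.h"

// Progress callback: returning false aborts model loading (new contract documented above).
static bool on_progress(float progress, void * ctx) {
    (void) ctx;
    fprintf(stderr, "\rloading: %3.0f %%", progress * 100.0f);
    return true; // keep loading
}

int main(void) {
    // Override one GGUF key at load time (the key name is illustrative only).
    struct llama_model_kv_override overrides[2];
    memset(overrides, 0, sizeof(overrides));
    strncpy(overrides[0].key, "some.model.key", sizeof(overrides[0].key) - 1);
    overrides[0].tag       = LLAMA_KV_OVERRIDE_INT;
    overrides[0].int_value = 8;
    overrides[1].key[0]    = '\0'; // empty key terminates the array (llama.cpp convention, assumed)

    struct llama_model_params mparams = llama_model_default_params();
    mparams.kv_overrides      = overrides;
    mparams.progress_callback = on_progress;

    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        return 1;
    }
    llama_free_model(model);
    return 0;
}
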
@@ -185,17 +208,20 @@ extern "C" {
 // ref: https://github.com/ggerganov/llama.cpp/pull/2054
 float rope_freq_base;  // RoPE base frequency, 0 = from model
 float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
-float yarn_ext_factor; // YaRN extrapolation mix factor,
+float yarn_ext_factor;  // YaRN extrapolation mix factor, negative = from model
 float yarn_attn_factor; // YaRN magnitude scaling factor
 float yarn_beta_fast;   // YaRN low correction dim
 float yarn_beta_slow;   // YaRN high correction dim
 uint32_t yarn_orig_ctx; // YaRN original context size

+enum ggml_type type_k; // data type for K cache
+enum ggml_type type_v; // data type for V cache
+
 // Keep the booleans together to avoid misalignment during copy-by-value.
-bool mul_mat_q;
-bool
-bool
-bool
+bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+bool embedding;   // embedding mode only
+bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
 };

 // model quantization parameters

@@ -290,7 +316,9 @@ extern "C" {

 LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);

-
+// TODO: become more consistent with returned int types across the API
+LLAMA_API uint32_t llama_n_ctx  (const struct llama_context * ctx);
+LLAMA_API uint32_t llama_n_batch(const struct llama_context * ctx);

 LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);

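With f16_kv gone (see the talk-llama.cpp change below), the K/V cache layout is now chosen per context through type_k and type_v, and offload_kqv controls whether the KQV ops and the cache live on the GPU. A sketch, assuming llama_context_default_params() from this header; whether a quantized K cache type is accepted depends on the build.

#include "llama.h"

// Build a context with a quantized K cache and KQV offload enabled.
struct llama_context * make_context(struct llama_model * model) {
    struct llama_context_params cparams = llama_context_default_params();

    cparams.n_ctx       = 2048;
    cparams.type_k      = GGML_TYPE_Q8_0; // quantized K cache (support depends on the build)
    cparams.type_v      = GGML_TYPE_F16;  // keep the V cache in f16
    cparams.offload_kqv = true;           // run KQV ops (and the KV cache) on the GPU when possible

    return llama_new_context_with_model(model, cparams);
}
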
@@ -301,6 +329,23 @@ extern "C" {
 // Get the model's RoPE frequency scaling factor
 LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);

+// Functions to access the model's GGUF metadata scalar values
+// - The functions return the length of the string on success, or -1 on failure
+// - The output string is always null-terminated and cleared on failure
+// - GGUF array values are not supported by these functions
+
+// Get metadata value as a string by key name
+LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+
+// Get the number of metadata key/value pairs
+LLAMA_API int llama_model_meta_count(const struct llama_model * model);
+
+// Get metadata key name by index
+LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
+// Get metadata value as a string by index
+LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
 // Get a string describing the model type
 LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);

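The metadata accessors above make it possible to enumerate every scalar GGUF key/value pair of a loaded model. A small helper using only the functions introduced in this hunk; the buffer sizes are arbitrary and values longer than the buffer are truncated.

#include <stdio.h>
#include "llama.h"

// Dump all GGUF metadata key/value pairs of a loaded model.
static void dump_meta(const struct llama_model * model) {
    char key[256];
    char val[256];

    const int n_kv = llama_model_meta_count(model);
    for (int i = 0; i < n_kv; ++i) {
        if (llama_model_meta_key_by_index    (model, i, key, sizeof(key)) < 0) continue;
        if (llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) < 0) continue; // arrays are not supported
        printf("%s = %s\n", key, val);
    }
}
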
@@ -344,9 +389,60 @@ extern "C" {
 // KV cache
 //

-//
-
-
+// Information associated with an individual cell in the KV cache view.
+struct llama_kv_cache_view_cell {
+    // The position for this cell. Takes KV cache shifts into account.
+    // May be negative if the cell is not populated.
+    llama_pos pos;
+};
+
+// An updateable view of the KV cache.
+struct llama_kv_cache_view {
+    // Number of KV cache cells. This will be the same as the context size.
+    int32_t n_cells;
+
+    // Maximum number of sequences that can exist in a cell. It's not an error
+    // if there are more sequences in a cell than this value, however they will
+    // not be visible in the view cells_sequences.
+    int32_t n_max_seq;
+
+    // Number of tokens in the cache. For example, if there are two populated
+    // cells, the first with 1 sequence id in it and the second with 2 sequence
+    // ids then you'll have 3 tokens.
+    int32_t token_count;
+
+    // Number of populated cache cells.
+    int32_t used_cells;
+
+    // Maximum contiguous empty slots in the cache.
+    int32_t max_contiguous;
+
+    // Index to the start of the max_contiguous slot range. Can be negative
+    // when cache is full.
+    int32_t max_contiguous_idx;
+
+    // Information for an individual cell.
+    struct llama_kv_cache_view_cell * cells;
+
+    // The sequences for each cell. There will be n_max_seq items per cell.
+    llama_seq_id * cells_sequences;
+};
+
+// Create an empty KV cache view. (use only for debugging purposes)
+LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+
+// Free a KV cache view. (use only for debugging purposes)
+LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+
+// Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+
+// Returns the number of tokens in the KV cache (slow, use only for debug)
+// If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+
+// Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);

 // Clear the KV cache
 LLAMA_API void llama_kv_cache_clear(

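The KV cache view is meant purely for debugging: init once, update whenever the cache may have changed, read the fields, then free. A sketch using only the declarations added above.

#include <stdio.h>
#include "llama.h"

// Debug helper: print a summary of the KV cache using the new view API.
static void print_kv_cache_stats(const struct llama_context * ctx) {
    struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_max_seq =*/ 4);

    llama_kv_cache_view_update(ctx, &view);

    printf("cells: %d, used: %d, tokens: %d, max contiguous free: %d\n",
            (int) view.n_cells, (int) view.used_cells,
            (int) view.token_count, (int) view.max_contiguous);

    // per-cell positions (negative means the cell is not populated)
    for (int i = 0; i < view.n_cells && i < 16; ++i) {
        printf("cell %2d: pos = %d\n", i, (int) view.cells[i].pos);
    }

    llama_kv_cache_view_free(&view);
}
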
@@ -517,6 +613,12 @@ extern "C" {
 LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
 LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line

+// Returns -1 if unknown, 1 for true or 0 for false.
+LLAMA_API int llama_add_bos_token(const struct llama_model * model);
+
+// Returns -1 if unknown, 1 for true or 0 for false.
+LLAMA_API int llama_add_eos_token(const struct llama_model * model);
+
 // codellama infill tokens
 LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
 LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle

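llama_add_bos_token() and llama_add_eos_token() are tri-state. A small helper for the BOS case; the SPM fallback for the unknown case mirrors what llama.cpp examples did at the time and is an assumption here, not something this diff specifies.

#include <stdbool.h>
#include "llama.h"

// -1 = unknown, 0 = false, 1 = true
static bool should_add_bos(const struct llama_model * model) {
    const int add_bos = llama_add_bos_token(model);
    if (add_bos < 0) {
        // unknown: fall back to the vocabulary type (assumption, see note above)
        return llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
    }
    return add_bos != 0;
}
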
examples/talk-llama/talk-llama.cpp
CHANGED

@@ -282,7 +282,6 @@ int main(int argc, char ** argv) {
 // tune these to your liking
 lcparams.n_ctx     = 2048;
 lcparams.seed      = 1;
-lcparams.f16_kv    = true;
 lcparams.n_threads = params.n_threads;

 struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lcparams);

examples/talk.wasm/gpt-2.cpp
CHANGED

@@ -155,33 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 const int n_ctx   = hparams.n_ctx;
 const int n_vocab = hparams.n_vocab;

-ctx_size +=
-ctx_size +=
+ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
+ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b

-ctx_size += n_vocab*
-ctx_size += n_ctx*
-ctx_size += n_vocab*
+ctx_size += n_vocab*ggml_row_size(wtype, n_embd);       // wte
+ctx_size += n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
+ctx_size += n_vocab*ggml_row_size(wtype, n_embd);       // lm_head

-ctx_size += n_layer*(
-ctx_size += n_layer*(
+ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
+ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b

-ctx_size += n_layer*(
-ctx_size += n_layer*(
+ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
+ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b

-ctx_size += n_layer*(3*n_embd*n_embd
-ctx_size += n_layer*(
+ctx_size += n_layer*(ggml_row_size(wtype,         3*n_embd*n_embd)); // c_attn_attn_w
+ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));        // c_attn_attn_b

-ctx_size += n_layer*(n_embd*n_embd
-ctx_size += n_layer*(
+ctx_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd)); // c_attn_proj_w
+ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));        // c_attn_proj_b

-ctx_size += n_layer*(4*n_embd*n_embd
-ctx_size += n_layer*(
+ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
+ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b

-ctx_size += n_layer*(4*n_embd*n_embd
-ctx_size += n_layer*(
+ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
+ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));          // c_mlp_proj_b

-ctx_size += n_ctx*n_layer*
-ctx_size += n_ctx*n_layer*
+ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
+ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v

 ctx_size += (6 + 12*n_layer)*256; // object overhead

@@ -524,8 +524,7 @@ bool gpt2_eval(
 struct ggml_tensor * KQ_scaled =
     ggml_scale(ctx0,
         KQ,
-
-        );
+        1.0f/sqrt(float(n_embd)/n_head));

 // KQ_masked = mask_past(KQ_scaled)
 // [n_past + N, N, 12]

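ggml_row_size(type, n) returns the number of bytes taken by a row of n elements of the given type, which is what the sizing code above now uses in place of the old float-based arithmetic. A standalone sketch; the dimensions are illustrative and ggml_tensor_overhead() is assumed from ggml.h rather than taken from this diff.

#include <stdio.h>
#include "ggml.h"

// Size a context buffer for one f32 vector and one quantized matrix with ggml_row_size.
int main(void) {
    const int n_embd  = 512;
    const int n_vocab = 32000;
    const enum ggml_type wtype = GGML_TYPE_Q4_0;

    size_t ctx_size = 0;
    ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd);  // one f32 row of n_embd elements
    ctx_size += n_vocab*ggml_row_size(wtype, n_embd);  // n_vocab quantized rows (wte-style matrix)
    ctx_size += 2*ggml_tensor_overhead();              // per-tensor object overhead (assumed helper)

    printf("need %zu bytes\n", ctx_size);
    return 0;
}
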
examples/talk/gpt-2.cpp
CHANGED

@@ -155,33 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 const int n_ctx   = hparams.n_ctx;
 const int n_vocab = hparams.n_vocab;

-ctx_size +=
-ctx_size +=
+ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
+ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b

-ctx_size += n_vocab*
-ctx_size += n_ctx*
-ctx_size += n_vocab*
+ctx_size += n_vocab*ggml_row_size(wtype, n_embd);       // wte
+ctx_size += n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
+ctx_size += n_vocab*ggml_row_size(wtype, n_embd);       // lm_head

-ctx_size += n_layer*(
-ctx_size += n_layer*(
+ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
+ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b

-ctx_size += n_layer*(
-ctx_size += n_layer*(
+ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
+ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b

-ctx_size += n_layer*(3*n_embd*n_embd
-ctx_size += n_layer*(
+ctx_size += n_layer*(ggml_row_size(wtype,         3*n_embd*n_embd)); // c_attn_attn_w
+ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));        // c_attn_attn_b

-ctx_size += n_layer*(n_embd*n_embd
-ctx_size += n_layer*(
+ctx_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd)); // c_attn_proj_w
+ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));        // c_attn_proj_b

-ctx_size += n_layer*(4*n_embd*n_embd
-ctx_size += n_layer*(
+ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
+ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b

-ctx_size += n_layer*(4*n_embd*n_embd
-ctx_size += n_layer*(
+ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
+ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));          // c_mlp_proj_b

-ctx_size += n_ctx*n_layer*
-ctx_size += n_ctx*n_layer*
+ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
+ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v

 ctx_size += (6 + 12*n_layer)*256; // object overhead

@@ -525,8 +525,7 @@ bool gpt2_eval(
 struct ggml_tensor * KQ_scaled =
     ggml_scale(ctx0,
         KQ,
-
-        );
+        1.0f/sqrt(float(n_embd)/n_head));

 // KQ_masked = mask_past(KQ_scaled)
 // [n_past + N, N, 12]

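The second change in this file reflects the new ggml_scale() signature: the scale factor is passed as a plain float instead of a separate 1-element tensor. A sketch of the same attention-score scaling written as a helper function.

#include <math.h>
#include "ggml.h"

// Build KQ_scaled = KQ * 1/sqrt(n_embd/n_head) with the float-based ggml_scale.
struct ggml_tensor * scale_attn_scores(struct ggml_context * ctx0,
                                       struct ggml_tensor  * KQ,
                                       int n_embd, int n_head) {
    const float kq_scale = 1.0f/sqrtf((float) n_embd/n_head);
    return ggml_scale(ctx0, KQ, kq_scale);
}
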
extra/sync-llama.sh
ADDED

@@ -0,0 +1,5 @@
+#!/bin/bash
+
+cp -rpv ../llama.cpp/llama.h   ./examples/talk-llama/llama.h
+cp -rpv ../llama.cpp/llama.cpp ./examples/talk-llama/llama.cpp
+cp -rpv ../llama.cpp/unicode.h ./examples/talk-llama/unicode.h

ggml-alloc.c
CHANGED

@@ -72,7 +72,7 @@ static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * t

 // check if a tensor is allocated by this buffer
 static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
-    return tensor->buffer == alloc->buffer;
+    return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer);
 }

 static bool ggml_is_view(struct ggml_tensor * t) {

@@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
 if (update_backend) {
     view->backend = view->view_src->backend;
 }
-
+// views are initialized in the alloc buffer rather than the view_src buffer
+view->buffer = alloc->buffer;
 view->data = (char *)view->view_src->data + view->view_offs;

-// FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
-// due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
 assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);

 if (!alloc->measure) {

@@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
 }

 void ggml_allocr_free(ggml_allocr_t alloc) {
+    if (alloc == NULL) {
+        return;
+    }
+
     ggml_gallocr_free(alloc->galloc);
     ggml_tallocr_free(alloc->talloc);
     free(alloc);

@@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
 }

 if (nbytes == 0) {
-
+    // all the tensors in the context are already allocated
     return NULL;
 }

@@ -789,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
 } else {
     ggml_backend_view_init(buffer, t);
 }
+} else {
+    if (t->view_src != NULL) {
+        // view of a pre-allocated tensor
+        ggml_backend_view_init(buffer, t);
+    }
 }
 }

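ggml_allocr_free() now returns early on a NULL allocator, so cleanup paths no longer need their own guards. A sketch of the kind of call site this simplifies; the two-allocator layout is illustrative, not taken from this diff.

#include "ggml-alloc.h"

// Cleanup path that no longer needs its own NULL checks.
static void free_allocators(ggml_allocr_t alloc_encode, ggml_allocr_t alloc_decode) {
    ggml_allocr_free(alloc_encode); // safe even if it was never created
    ggml_allocr_free(alloc_decode);
}
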
ggml-backend-impl.h
CHANGED

@@ -20,6 +20,9 @@ extern "C" {
 size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
 size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
 bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+// check if tensor data is in host memory
+// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
+bool (*is_host) (ggml_backend_buffer_type_t buft);
 };

 struct ggml_backend_buffer_type {

@@ -31,15 +34,16 @@ extern "C" {
 typedef void * ggml_backend_buffer_context_t;

 struct ggml_backend_buffer_i {
-void
+void (*free_buffer) (ggml_backend_buffer_t buffer);
 //void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
-void *
-void
-void
-void
+void * (*get_base)    (ggml_backend_buffer_t buffer);
+void (*init_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+void (*set_tensor)    (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+void (*get_tensor)    (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
 // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
-void
-void
+void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+void (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+void (*clear)          (ggml_backend_buffer_t buffer, uint8_t value);
 };

 struct ggml_backend_buffer {

@@ -78,7 +82,7 @@ extern "C" {
 void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
 void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);

-void (*synchronize)
+void (*synchronize)(ggml_backend_t backend);

 // compute graph with a plan
 ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);

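For out-of-tree backends, the interface changes above mean two new callbacks have to be provided: clear on the buffer interface and, optionally, is_host on the buffer-type interface. A sketch of what they might look like for a hypothetical host-memory backend, mirroring the CPU implementations in ggml-backend.c; the my_ names are made up.

#include <string.h>
#include "ggml-backend-impl.h"

// Hypothetical host-memory backend: the buffer context is the raw allocation.
static void my_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    memset(buffer->context, value, buffer->size);
}

static bool my_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
    return true; // the data lives in ordinary host memory

    GGML_UNUSED(buft);
}
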
ggml-backend.c
CHANGED

@@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_ba
 return buft->iface.supports_backend(buft, backend);
 }

+bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
+    if (buft->iface.is_host) {
+        return buft->iface.is_host(buft);
+    }
+    return false;
+}
+
 // backend buffer

 ggml_backend_buffer_t ggml_backend_buffer_init(

@@ -94,6 +101,14 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
 return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
 }

+void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    buffer->iface.clear(buffer, value);
+}
+
+bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
+}
+
 ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
 return buffer->buft;
 }

@@ -378,7 +393,6 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {

 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 free(buffer->context);
-GGML_UNUSED(buffer);
 }

 static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {

@@ -411,6 +425,10 @@ static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer,
 GGML_UNUSED(buffer);
 }

+static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    memset(buffer->context, value, buffer->size);
+}
+
 static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
 /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
 /* .get_base = */ ggml_backend_cpu_buffer_get_base,

@@ -419,6 +437,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
 /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
 /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
 /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+/* .clear = */ ggml_backend_cpu_buffer_clear,
 };

 // for buffers from ptr, free is not called

@@ -430,6 +449,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
 /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
 /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
 /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+/* .clear = */ ggml_backend_cpu_buffer_clear,
 };

 static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512

@@ -455,20 +475,70 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty
 GGML_UNUSED(buft);
 }

+static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    GGML_UNUSED(buft);
+}
+
 ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
-static struct ggml_backend_buffer_type
+static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
 /* .iface = */ {
 /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
 /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
 /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
 /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
 },
 /* .context = */ NULL,
 };

-return &
+return &ggml_backend_cpu_buffer_type;
 }

+#ifdef GGML_USE_CPU_HBM
+
+// buffer type HBM
+
+#include <hbwmalloc.h>
+
+static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    hbw_free(buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    //void * ptr = hbw_malloc(size);
+    void * ptr;
+    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+    if (result != 0) {
+        fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
+        return NULL;
+    }
+
+    // FIXME: this is a hack to avoid having to implement a new buffer type
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
+        /* .iface = */ {
+            /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+            /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+            /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type_hbm;
+}
+#endif
+
 struct ggml_backend_cpu_context {
 int n_threads;
 void * work_data;

@@ -505,7 +575,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
 struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));

 cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
-cpu_plan->cgraph = *cgraph;
+cpu_plan->cgraph = *cgraph; // FIXME: deep copy

 if (cpu_plan->cplan.work_size > 0) {
 cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);

@@ -1180,7 +1250,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
 // utils
 void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
 GGML_ASSERT(tensor->buffer == NULL);
-GGML_ASSERT(tensor->data == NULL);
+//GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
 GGML_ASSERT(tensor->view_src != NULL);
 GGML_ASSERT(tensor->view_src->buffer != NULL);
 GGML_ASSERT(tensor->view_src->data != NULL);

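The new public wrappers can be used without touching backend internals. A sketch that allocates a CPU buffer, checks that it is host-resident and zero-fills it; ggml_backend_buft_alloc_buffer() is assumed from ggml-backend.h and is not part of this hunk.

#include <assert.h>
#include "ggml-backend.h"

int main(void) {
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

    // the host check works on the buffer type or on a concrete buffer
    assert(ggml_backend_buft_is_host(buft));

    // ggml_backend_buft_alloc_buffer() is assumed from ggml-backend.h (not shown in this diff)
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 16*1024*1024);

    assert(ggml_backend_buffer_is_host(buf));
    ggml_backend_buffer_clear(buf, 0); // zero-initialize, e.g. a fresh cache buffer

    ggml_backend_buffer_free(buf);
    return 0;
}
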
ggml-backend.h
CHANGED

@@ -21,6 +21,7 @@ extern "C" {
 GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
 GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
 GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+GGML_API bool ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);

 // buffer
 GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);

@@ -29,6 +30,8 @@ extern "C" {
 GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
 GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
 GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+GGML_API void ggml_backend_buffer_clear   (ggml_backend_buffer_t buffer, uint8_t value);
+GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
 GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);

 //

@@ -76,6 +79,10 @@ extern "C" {

 GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

+#ifdef GGML_USE_CPU_HBM
+GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+#endif
+
 //
 // Backend registry
 //

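The GGML_USE_CPU_HBM guard above suggests the obvious selection pattern: prefer the high-bandwidth-memory buffer type when the build provides it, otherwise fall back to the regular CPU buffer type. A sketch.

#include "ggml-backend.h"

// Pick the host buffer type, preferring HBM when the build enables it.
static ggml_backend_buffer_type_t pick_host_buffer_type(void) {
#ifdef GGML_USE_CPU_HBM
    return ggml_backend_cpu_hbm_buffer_type();
#else
    return ggml_backend_cpu_buffer_type();
#endif
}
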
ggml-cuda.cu
CHANGED
|
@@ -31,6 +31,7 @@
|
|
| 31 |
#define CUDA_R_16F HIPBLAS_R_16F
|
| 32 |
#define CUDA_R_32F HIPBLAS_R_32F
|
| 33 |
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
|
|
|
| 34 |
#define cublasCreate hipblasCreate
|
| 35 |
#define cublasGemmEx hipblasGemmEx
|
| 36 |
#define cublasGemmBatchedEx hipblasGemmBatchedEx
|
|
@@ -40,6 +41,7 @@
|
|
| 40 |
#define cublasSetStream hipblasSetStream
|
| 41 |
#define cublasSgemm hipblasSgemm
|
| 42 |
#define cublasStatus_t hipblasStatus_t
|
|
|
|
| 43 |
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
|
| 44 |
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
|
| 45 |
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
|
|
@@ -58,8 +60,13 @@
|
|
| 58 |
#define cudaGetDeviceProperties hipGetDeviceProperties
|
| 59 |
#define cudaGetErrorString hipGetErrorString
|
| 60 |
#define cudaGetLastError hipGetLastError
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
#define cudaMalloc hipMalloc
|
| 62 |
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
|
|
|
| 63 |
#define cudaMemcpy hipMemcpy
|
| 64 |
#define cudaMemcpy2DAsync hipMemcpy2DAsync
|
| 65 |
#define cudaMemcpyAsync hipMemcpyAsync
|
|
@@ -78,10 +85,18 @@
|
|
| 78 |
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
|
| 79 |
#define cudaStream_t hipStream_t
|
| 80 |
#define cudaSuccess hipSuccess
|
|
|
|
| 81 |
#else
|
| 82 |
#include <cuda_runtime.h>
|
| 83 |
#include <cublas_v2.h>
|
| 84 |
#include <cuda_fp16.h>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
#endif // defined(GGML_USE_HIPBLAS)
|
| 86 |
|
| 87 |
#include "ggml-cuda.h"
|
|
@@ -510,6 +525,14 @@ static size_t g_scratch_offset = 0;
|
|
| 510 |
|
| 511 |
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
| 512 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 513 |
static __device__ __forceinline__ float warp_reduce_sum(float x) {
|
| 514 |
#pragma unroll
|
| 515 |
for (int mask = 16; mask > 0; mask >>= 1) {
|
|
@@ -1970,8 +1993,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
|
|
| 1970 |
// second part effectively subtracts 8 from each quant value
|
| 1971 |
return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
|
| 1972 |
#else
|
| 1973 |
-
|
| 1974 |
-
return 0.0f; // only to satisfy the compiler
|
| 1975 |
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 1976 |
}
|
| 1977 |
|
|
@@ -2008,8 +2030,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
|
|
| 2008 |
// scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
|
| 2009 |
return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
|
| 2010 |
#else
|
| 2011 |
-
|
| 2012 |
-
return 0.0f; // only to satisfy the compiler
|
| 2013 |
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 2014 |
}
|
| 2015 |
|
|
@@ -2044,8 +2065,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
|
|
| 2044 |
// second part effectively subtracts 16 from each quant value
|
| 2045 |
return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
|
| 2046 |
#else
|
| 2047 |
-
|
| 2048 |
-
return 0.0f; // only to satisfy the compiler
|
| 2049 |
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 2050 |
}
|
| 2051 |
|
|
@@ -2090,8 +2110,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
|
|
| 2090 |
return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
|
| 2091 |
|
| 2092 |
#else
|
| 2093 |
-
|
| 2094 |
-
return 0.0f; // only to satisfy the compiler
|
| 2095 |
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 2096 |
}
|
| 2097 |
|
|
@@ -2112,8 +2131,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
|
|
| 2112 |
|
| 2113 |
return d8_0*d8_1 * sumi;
|
| 2114 |
#else
|
| 2115 |
-
|
| 2116 |
-
return 0.0f; // only to satisfy the compiler
|
| 2117 |
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 2118 |
}
|
| 2119 |
|
|
@@ -2143,8 +2161,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
|
|
| 2143 |
// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
|
| 2144 |
return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
|
| 2145 |
#else
|
| 2146 |
-
|
| 2147 |
-
return 0.0f; // only to satisfy the compiler
|
| 2148 |
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 2149 |
}
|
| 2150 |
|
|
@@ -2179,8 +2196,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
|
|
| 2179 |
|
| 2180 |
return dm2f.x*sumf_d - dm2f.y*sumf_m;
|
| 2181 |
#else
|
| 2182 |
-
|
| 2183 |
-
return 0.0f; // only to satisfy the compiler
|
| 2184 |
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 2185 |
}
|
| 2186 |
|
|
@@ -2217,8 +2233,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
|
|
| 2217 |
|
| 2218 |
return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
|
| 2219 |
#else
|
| 2220 |
-
|
| 2221 |
-
return 0.0f; // only to satisfy the compiler
|
| 2222 |
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 2223 |
}
|
| 2224 |
|
|
@@ -2258,8 +2273,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
|
|
| 2258 |
|
| 2259 |
return d3 * sumf;
|
| 2260 |
#else
|
| 2261 |
-
|
| 2262 |
-
return 0.0f; // only to satisfy the compiler
|
| 2263 |
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 2264 |
}
|
| 2265 |
|
|
@@ -2284,8 +2298,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
|
|
| 2284 |
|
| 2285 |
return d3*d8 * sumi;
|
| 2286 |
#else
|
| 2287 |
-
|
| 2288 |
-
return 0.0f; // only to satisfy the compiler
|
| 2289 |
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 2290 |
}
|
| 2291 |
|
|
@@ -2318,8 +2331,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
|
|
| 2318 |
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
| 2319 |
|
| 2320 |
#else
|
| 2321 |
-
|
| 2322 |
-
return 0.0f; // only to satisfy the compiler
|
| 2323 |
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 2324 |
}
|
| 2325 |
|
|
@@ -2352,8 +2364,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
|
|
| 2352 |
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
| 2353 |
|
| 2354 |
#else
|
| 2355 |
-
|
| 2356 |
-
return 0.0f; // only to satisfy the compiler
|
| 2357 |
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 2358 |
}
|
| 2359 |
|
|
@@ -2393,8 +2404,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
|
|
| 2393 |
return dm5f.x*sumf_d - dm5f.y*sumf_m;
|
| 2394 |
|
| 2395 |
#else
|
| 2396 |
-
|
| 2397 |
-
return 0.0f; // only to satisfy the compiler
|
| 2398 |
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 2399 |
}
|
| 2400 |
|
|
@@ -2427,8 +2437,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
|
|
| 2427 |
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
| 2428 |
|
| 2429 |
#else
|
| 2430 |
-
|
| 2431 |
-
return 0.0f; // only to satisfy the compiler
|
| 2432 |
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 2433 |
}
|
| 2434 |
|
|
@@ -2458,8 +2467,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
|
|
| 2458 |
|
| 2459 |
return d*sumf;
|
| 2460 |
#else
|
| 2461 |
-
|
| 2462 |
-
return 0.0f; // only to satisfy the compiler
|
| 2463 |
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 2464 |
}
|
| 2465 |
|
|
@@ -2490,8 +2498,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
|
|
| 2490 |
return d6 * sumf_d;
|
| 2491 |
|
| 2492 |
#else
|
| 2493 |
-
|
| 2494 |
-
return 0.0f; // only to satisfy the compiler
|
| 2495 |
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 2496 |
}
|
| 2497 |
|
|
@@ -3357,8 +3364,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
|
| 3357 |
return dall * sumf_d - dmin * sumf_m;
|
| 3358 |
|
| 3359 |
#else
|
| 3360 |
-
|
| 3361 |
-
return 0.0f; // only to satisfy the compiler
|
| 3362 |
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 3363 |
|
| 3364 |
#endif
|
|
@@ -3541,8 +3547,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
|
| 3541 |
return d * sumf_d;
|
| 3542 |
|
| 3543 |
#else
|
| 3544 |
-
|
| 3545 |
-
return 0.0f; // only to satisfy the compiler
|
| 3546 |
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 3547 |
|
| 3548 |
#endif
|
|
@@ -3952,7 +3957,7 @@ template <bool need_check> static __global__ void
|
|
| 3952 |
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
| 3953 |
#else
|
| 3954 |
(void) vec_dot_q4_0_q8_1_mul_mat;
|
| 3955 |
-
|
| 3956 |
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
| 3957 |
}
|
| 3958 |
|
|
@@ -4021,7 +4026,7 @@ template <bool need_check> static __global__ void
|
|
| 4021 |
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
| 4022 |
#else
|
| 4023 |
(void) vec_dot_q4_1_q8_1_mul_mat;
|
| 4024 |
-
|
| 4025 |
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
| 4026 |
}
|
| 4027 |
|
|
@@ -4088,7 +4093,7 @@ template <bool need_check> static __global__ void
|
|
| 4088 |
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
| 4089 |
#else
|
| 4090 |
(void) vec_dot_q5_0_q8_1_mul_mat;
|
| 4091 |
-
|
| 4092 |
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
| 4093 |
}
|
| 4094 |
|
|
@@ -4155,7 +4160,7 @@ mul_mat_q5_1(
|
|
| 4155 |
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
| 4156 |
#else
|
| 4157 |
(void) vec_dot_q5_1_q8_1_mul_mat;
|
| 4158 |
-
|
| 4159 |
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
| 4160 |
}
|
| 4161 |
|
|
@@ -4222,7 +4227,7 @@ template <bool need_check> static __global__ void
|
|
| 4222 |
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
| 4223 |
#else
|
| 4224 |
(void) vec_dot_q8_0_q8_1_mul_mat;
|
| 4225 |
-
|
| 4226 |
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
| 4227 |
}
|
| 4228 |
|
|
@@ -4289,7 +4294,7 @@ mul_mat_q2_K(
|
|
| 4289 |
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
| 4290 |
#else
|
| 4291 |
(void) vec_dot_q2_K_q8_1_mul_mat;
|
| 4292 |
-
|
| 4293 |
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
| 4294 |
}
|
| 4295 |
|
|
@@ -4358,7 +4363,7 @@ template <bool need_check> static __global__ void
|
|
| 4358 |
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
| 4359 |
#else
|
| 4360 |
(void) vec_dot_q3_K_q8_1_mul_mat;
|
| 4361 |
-
|
| 4362 |
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
| 4363 |
}
|
| 4364 |
|
|
@@ -4427,7 +4432,7 @@ template <bool need_check> static __global__ void
|
|
| 4427 |
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
| 4428 |
#else
|
| 4429 |
(void) vec_dot_q4_K_q8_1_mul_mat;
|
| 4430 |
-
|
| 4431 |
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
| 4432 |
}
|
| 4433 |
|
|
@@ -4494,7 +4499,7 @@ mul_mat_q5_K(
|
|
| 4494 |
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
| 4495 |
#else
|
| 4496 |
(void) vec_dot_q5_K_q8_1_mul_mat;
|
| 4497 |
-
|
| 4498 |
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
| 4499 |
}
|
| 4500 |
|
|
@@ -4563,7 +4568,7 @@ template <bool need_check> static __global__ void
|
|
| 4563 |
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
| 4564 |
#else
|
| 4565 |
(void) vec_dot_q6_K_q8_1_mul_mat;
|
| 4566 |
-
|
| 4567 |
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
| 4568 |
}
|
| 4569 |
|
|
@@ -4998,7 +5003,16 @@ static __global__ void rope_neox(
|
|
| 4998 |
const int ib = col / n_dims;
|
| 4999 |
const int ic = col % n_dims;
|
| 5000 |
|
| 5001 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5002 |
const int i2 = row/p_delta_rows;
|
| 5003 |
|
| 5004 |
float cur_rot = inv_ndims * ic - ib;
|
|
@@ -5259,17 +5273,17 @@ static __global__ void im2col_f32_f16(
|
|
| 5259 |
const int ky = (i - kd) / OW;
|
| 5260 |
const int ix = i % OW;
|
| 5261 |
|
| 5262 |
-
const
|
| 5263 |
-
const
|
| 5264 |
|
| 5265 |
-
const
|
| 5266 |
(blockIdx.y * OW + ix) * CHW +
|
| 5267 |
(blockIdx.z * (KW * KH) + ky * KW + kx);
|
| 5268 |
|
| 5269 |
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
| 5270 |
dst[offset_dst] = __float2half(0.0f);
|
| 5271 |
} else {
|
| 5272 |
-
const
|
| 5273 |
dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
|
| 5274 |
}
|
| 5275 |
}
|
|
@@ -6814,6 +6828,7 @@ static void ggml_cuda_op_get_rows(
|
|
| 6814 |
break;
|
| 6815 |
default:
|
| 6816 |
// TODO: k-quants
|
|
|
|
| 6817 |
GGML_ASSERT(false);
|
| 6818 |
break;
|
| 6819 |
}
|
|
@@ -7057,6 +7072,7 @@ inline void ggml_cuda_op_upscale(
|
|
| 7057 |
|
| 7058 |
(void) src1;
|
| 7059 |
(void) dst;
|
|
|
|
| 7060 |
}
|
| 7061 |
|
| 7062 |
inline void ggml_cuda_op_pad(
|
|
@@ -7073,6 +7089,7 @@ inline void ggml_cuda_op_pad(
|
|
| 7073 |
|
| 7074 |
(void) src1;
|
| 7075 |
(void) dst;
|
|
|
|
| 7076 |
}
|
| 7077 |
|
| 7078 |
inline void ggml_cuda_op_rms_norm(
|
|
@@ -7376,7 +7393,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|
| 7376 |
|
| 7377 |
const int compute_capability = g_compute_capabilities[id];
|
| 7378 |
|
| 7379 |
-
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
|
| 7380 |
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
|
| 7381 |
half * src0_as_f16 = nullptr;
|
| 7382 |
size_t src0_as = 0;
|
|
@@ -7690,17 +7707,10 @@ inline void ggml_cuda_op_scale(
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
     float scale;
-
-    if (src1->backend == GGML_BACKEND_CPU) {
-        scale = ((float *) src1->data)[0];
-    } else {
-        // TODO: pass pointer to kernel instead of copying to host
-        CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
-    }
+    memcpy(&scale, dst->op_params, sizeof(float));
 
     scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());

@@ -7747,8 +7757,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
     const bool  dst_on_device = dst->backend == GGML_BACKEND_GPU;
 
-    const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
-
     // dd = data device
     float * src0_ddf = nullptr;
     float * src1_ddf = nullptr;

@@ -7769,7 +7777,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
         CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
     }
 
+    if (use_src1) {
         if (src1_on_device) {
             src1_ddf = (float *) src1_extra->data_device[g_main_device];
         } else {
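With this sync the scale factor for GGML_OP_SCALE travels in the destination tensor's op_params instead of a separate F32 src1 tensor, which is why ggml_cuda_op_scale above now reads it with a plain memcpy rather than copying from src1 (the Metal scale path in ggml-metal.m is touched by the same change). A minimal host-side sketch of that convention; the toy_tensor type and helper names are illustrative, not ggml API:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Stand-in for the relevant part of a ggml tensor: op_params is a small
    // scratch area that operators reuse for immediate arguments.
    struct toy_tensor {
        int32_t op_params[16];
    };

    // The producer stores the scale the way the new ggml_scale does ...
    static void set_scale(toy_tensor & dst, float scale) {
        std::memcpy(dst.op_params, &scale, sizeof(scale));
    }

    // ... and the backend reads it back exactly like ggml_cuda_op_scale above.
    static float get_scale(const toy_tensor & dst) {
        float scale;
        std::memcpy(&scale, dst.op_params, sizeof(scale));
        return scale;
    }

    int main() {
        toy_tensor t = {};
        set_scale(t, 1.0f/8.0f);
        std::printf("scale = %f\n", get_scale(t));
        return 0;
    }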
@@ -7817,6 +7825,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
     }
 
 #ifdef NDEBUG
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
     for (int id = 0; id < g_device_count; ++id) {
         CUDA_CHECK(ggml_cuda_set_device(id));
 

@@ -7868,8 +7881,6 @@ static void ggml_cuda_op_mul_mat(
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
-    ggml_cuda_set_peer_access(ne11);
-
     GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
@@ -8300,27 +8311,27 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
 }
 
 static __global__ void k_compute_batched_ptrs(
-        const half * src0_as_f16, const half * src1_as_f16,
+        const half * src0_as_f16, const half * src1_as_f16, char * dst,
         const void ** ptrs_src, void ** ptrs_dst,
+        int64_t ne12, int64_t ne13,
+        int64_t ne23,
+        size_t  nb02, size_t  nb03,
+        size_t  nb12, size_t  nb13,
+        size_t  nbd2, size_t  nbd3,
+        int64_t r2,   int64_t r3) {
+    int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
 
     if (i13 >= ne13 || i12 >= ne12) {
         return;
     }
 
+    int64_t i03 = i13 / r3;
+    int64_t i02 = i12 / r2;
 
     ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
     ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (       char *)         dst + i12*nbd2 + i13*nbd3;
 }
 
 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

@@ -8376,7 +8387,41 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
 
     size_t dst_as = 0;
+
+    half * dst_f16 = nullptr;
+    char * dst_t   = nullptr;
+
+    cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
+    cudaDataType_t      cu_data_type    = CUDA_R_16F;
+
+    // dst strides
+    size_t nbd2 = dst->nb[2];
+    size_t nbd3 = dst->nb[3];
+
+    const half  alpha_f16 = 1.0f;
+    const half  beta_f16  = 0.0f;
+
+    const float alpha_f32 = 1.0f;
+    const float beta_f32  = 0.0f;
+
+    const void * alpha = &alpha_f16;
+    const void * beta  = &beta_f16;
+
+    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+        dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+        dst_t   = (char *) dst_f16;
+
+        nbd2 /= sizeof(float) / sizeof(half);
+        nbd3 /= sizeof(float) / sizeof(half);
+    } else {
+        dst_t = (char *) dst_ddf;
+
+        cu_compute_type = CUBLAS_COMPUTE_32F;
+        cu_data_type    = CUDA_R_32F;
+
+        alpha = &alpha_f32;
+        beta  = &beta_f32;
+    }
 
     GGML_ASSERT(ne12 % ne02 == 0);
     GGML_ASSERT(ne13 % ne03 == 0);

@@ -8385,9 +8430,6 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     const int64_t r2 = ne12/ne02;
     const int64_t r3 = ne13/ne03;
 
-    const half alpha_f16 = 1.0f;
-    const half beta_f16  = 0.0f;
-
 #if 0
     // use cublasGemmEx
     {

@@ -8397,12 +8439,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
             int i02 = i12 / r2;
 
             CUBLAS_CHECK(
+            cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
+                alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F,   nb01/sizeof(half),
+                       (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F,   nb11/sizeof(float),
+                beta,  (      char *)       dst_t + i12*nbd2          + i13*nbd3,          cu_data_type, ne01,
+                cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
         }
     }

@@ -8414,11 +8456,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         CUBLAS_CHECK(
         cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
+                alpha, (const char *) src0_as_f16, CUDA_R_16F,   nb01/sizeof(half),  src0->nb[2]/sizeof(half),  // strideA
+                       (const char *) src1_as_f16, CUDA_R_16F,   nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
+                beta,  (      char *)       dst_t, cu_data_type, ne01,                dst->nb[2]/sizeof(float), // strideC
                 ne12*ne13,
+                cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     } else {
         // use cublasGemmBatchedEx

@@ -8435,24 +8477,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
         dim3 block_dims(ne13, ne12);
         k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
-                src0_as_f16, src1_as_f16,
+                src0_as_f16, src1_as_f16, dst_t,
                 ptrs_src, ptrs_dst,
                 ne12, ne13,
                 ne23,
                 nb02, nb03,
                 nb12, nb13,
+                nbd2, nbd3,
                 r2, r3);
         CUDA_CHECK(cudaGetLastError());
 
         CUBLAS_CHECK(
         cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
+                alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F,   nb01/sizeof(half),
+                       (const void **) (ptrs_src + 1*ne23), CUDA_R_16F,   nb11/sizeof(float),
+                beta,  (      void **) (ptrs_dst + 0*ne23), cu_data_type, ne01,
                 ne23,
+                cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
         if (ptrs_src_s != 0) {

@@ -8464,11 +8506,14 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     }
 #endif
 
+    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+        to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+
+        ggml_cuda_pool_free(dst_f16, dst_as);
+    }
 
     ggml_cuda_pool_free(src1_as_f16, src1_as);
-    ggml_cuda_pool_free(dst_f16, dst_as);
 }
 
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
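The batched cuBLAS path above now derives its accumulation precision from dst->op_params[0]: GGML_PREC_DEFAULT keeps F16 accumulation into a temporary half buffer that is converted to F32 at the end, anything else switches to F32 accumulation written directly into dst. A compact sketch of just that selection logic; the enum and struct below are toy stand-ins, not the real ggml or cuBLAS types:

    #include <cstdio>

    enum toy_prec { TOY_PREC_DEFAULT = 0, TOY_PREC_F32 = 1 };

    struct gemm_config {
        const char * compute_type;  // stands in for cublasComputeType_t
        const char * data_type;     // stands in for cudaDataType_t
        bool         temp_f16_dst;  // temporary half dst + final F16->F32 conversion?
    };

    static gemm_config pick_gemm_config(int prec) {
        if (prec == TOY_PREC_DEFAULT) {
            // fast path: accumulate in F16, convert the result to F32 afterwards
            return { "CUBLAS_COMPUTE_16F", "CUDA_R_16F", true };
        }
        // precise path: accumulate in F32 and write straight into the F32 dst
        return { "CUBLAS_COMPUTE_32F", "CUDA_R_32F", false };
    }

    int main() {
        const int precs[2] = { TOY_PREC_DEFAULT, TOY_PREC_F32 };
        for (int prec : precs) {
            const gemm_config cfg = pick_gemm_config(prec);
            std::printf("prec=%d -> %s / %s, temporary f16 dst: %s\n",
                        prec, cfg.compute_type, cfg.data_type, cfg.temp_f16_dst ? "yes" : "no");
        }
        return 0;
    }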
@@ -8732,7 +8777,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     // TODO: mmq/mmv support
 #endif
 
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb1  =  dst->nb[1];
 
     const struct ggml_tensor * ids = src0;
     const int32_t id = ((int32_t *) dst->op_params)[0];

@@ -8740,10 +8786,12 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
 
     std::vector<char> ids_host(ggml_nbytes(ids));
 
+    const cudaStream_t stream = g_cudaStreams[g_main_device][0];
+
     if (ids->backend == GGML_BACKEND_GPU) {
         const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
+        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
     } else {
         memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
     }

@@ -8757,37 +8805,110 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     ggml_tensor src1_row = *src1;
     ggml_tensor dst_row = *dst;
 
-    src1_row.nb[2] = src1_row.nb[1];
-    dst_row.nb[2]  = dst_row.nb[1];
-
-    src1_row.nb[3] = src1_row.nb[1];
-    dst_row.nb[3]  = dst_row.nb[1];
+    src1_row.backend = GGML_BACKEND_GPU;
+    dst_row.backend  = GGML_BACKEND_GPU;
 
     src1_row.extra = &src1_row_extra;
     dst_row.extra = &dst_row_extra;
 
+    char * src1_original = src1->backend == GGML_BACKEND_CPU ?
+        (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
+    char * dst_original  =  dst->backend == GGML_BACKEND_CPU ?
+        (char *)  dst->data : (char *)  dst_extra->data_device[g_main_device];
 
+    if (src1->ne[1] == 1) {
+        GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+        GGML_ASSERT(dst->backend  == GGML_BACKEND_GPU);
 
+        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+            //int32_t row_id;
+            //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+            //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
 
+            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
 
+            GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
 
+            src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
+            src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
 
+            dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
+            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
 
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+        }
+    } else {
+        size_t as_src1, as_dst;
+        char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
+        char *  dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst),  &as_dst);
+
+        src1_row_extra.data_device[g_main_device] = src1_contiguous;
+        dst_row_extra.data_device[g_main_device]  =  dst_contiguous;
+
+        const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
+            cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
+        const cudaMemcpyKind dst_kind  =  dst->backend == GGML_BACKEND_CPU ?
+            cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;
+
+        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+
+            int64_t num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }
+
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
+                                        nb11, src1_kind, stream));
+                num_src1_rows++;
+            }
+
+            if (num_src1_rows == 0) {
+                continue;
+            }
+
+            src1_row.ne[1] = num_src1_rows;
+            dst_row.ne[1] = num_src1_rows;
+
+            src1_row.nb[1] = nb11;
+            src1_row.nb[2] = num_src1_rows*nb11;
+            src1_row.nb[3] = num_src1_rows*nb11;
+
+            dst_row.nb[1] = nb1;
+            dst_row.nb[2] = num_src1_rows*nb1;
+            dst_row.nb[3] = num_src1_rows*nb1;
+
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+
+            num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }
+
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
+                                        nb1, dst_kind, stream));
+                num_src1_rows++;
+            }
+        }
+
+        ggml_cuda_pool_free(src1_contiguous, as_src1);
+        ggml_cuda_pool_free(dst_contiguous, as_dst);
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaStreamSynchronize(stream));
     }
 }
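For batches larger than one row, the new mul_mat_id path above gathers all src1 rows routed to the same expert into a contiguous scratch buffer, runs a single matrix multiplication per expert, and scatters the results back. A simplified CPU sketch of that gather/compute/scatter pattern; the container types and the stubbed matmul are placeholders, not ggml API:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Stub for "multiply the rows gathered for one expert by that expert's weights".
    // A real implementation would run the CUDA matmul here; the identity keeps the
    // sketch self-contained.
    static std::vector<float> matmul_for_expert(int32_t expert, const std::vector<std::vector<float>> & rows) {
        (void) expert;
        std::vector<float> out;
        for (const auto & r : rows) {
            out.insert(out.end(), r.begin(), r.end());
        }
        return out;
    }

    // ids[i] names the expert that should process row i of src1, mirroring the ids tensor above.
    static void mul_mat_id_sketch(const std::vector<int32_t> & ids,
                                  const std::vector<std::vector<float>> & src1_rows,
                                  std::vector<std::vector<float>> & dst_rows,
                                  int32_t n_experts) {
        for (int32_t expert = 0; expert < n_experts; ++expert) {
            // gather: contiguous copy of all rows routed to this expert
            std::vector<std::vector<float>> gathered;
            std::vector<int32_t> origin;
            for (int32_t i = 0; i < (int32_t) ids.size(); ++i) {
                if (ids[i] == expert) {
                    gathered.push_back(src1_rows[i]);
                    origin.push_back(i);
                }
            }
            if (gathered.empty()) {
                continue;
            }

            // compute: one batched multiplication per expert instead of one per row
            const std::vector<float> out = matmul_for_expert(expert, gathered);

            // scatter: copy each produced row back to its original position in dst
            const int32_t row_len = (int32_t) src1_rows[origin[0]].size();
            for (int32_t k = 0; k < (int32_t) origin.size(); ++k) {
                dst_rows[origin[k]].assign(out.begin() + k*row_len, out.begin() + (k + 1)*row_len);
            }
        }
    }

    int main() {
        const std::vector<int32_t> ids = {1, 0, 1, 1};
        const std::vector<std::vector<float>> src1 = {{1, 2}, {3, 4}, {5, 6}, {7, 8}};
        std::vector<std::vector<float>> dst(4);
        mul_mat_id_sketch(ids, src1, dst, 2);
        std::printf("dst[2][0] = %.1f\n", dst[2][0]);
        return 0;
    }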
@@ -8898,6 +9019,12 @@ static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, gg
     (void) dst;
 }
 
+static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
+}
+
 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     const int64_t nrows = ggml_nrows(tensor);

@@ -8947,13 +9074,12 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 
     // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
     if (ne0 % MATRIX_ROW_PADDING != 0) {
-        size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
-            * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
     }
 
     char * buf;
     CUDA_CHECK(cudaMalloc(&buf, size));
+    char * buf_host = (char *)data + offset_split;
 
     // set padding to 0 to avoid possible NaN values
     if (size > original_size) {

@@ -8975,7 +9101,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 }
 
 void ggml_cuda_free_data(struct ggml_tensor * tensor) {
-    if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
+    if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
        return;
    }
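Both padding computations in this file now collapse into ggml_row_size(type, n), i.e. the number of bytes occupied by n elements of a possibly block-quantized type: n/block_size blocks of type_size bytes each. A small sketch of the same arithmetic; the block parameters below are an example in the style of a 4-bit quantized type, not values read from ggml:

    #include <cassert>
    #include <cstddef>
    #include <cstdio>

    // Example block parameters: 32 elements per block, 18 bytes per block
    // (16 packed nibbles plus a scale) - an assumption for illustration.
    static const std::size_t kBlckSize = 32;
    static const std::size_t kTypeSize = 18;

    // Mirrors what ggml_row_size(type, n) computes: bytes needed for n elements.
    static std::size_t row_size(std::size_t n) {
        assert(n % kBlckSize == 0); // row lengths must be a whole number of blocks
        return (n / kBlckSize) * kTypeSize;
    }

    int main() {
        const std::size_t ne0     = 4000;           // elements per row
        const std::size_t padding = 512;            // MATRIX_ROW_PADDING in the diff
        std::size_t size = 64 * row_size(ne0);      // e.g. 64 rows
        if (ne0 % padding != 0) {
            // same shape as the diff: pad the last row up to a multiple of 512 elements
            size += row_size(padding - ne0 % padding);
        }
        std::printf("allocation size: %zu bytes\n", size);
        return 0;
    }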
@@ -9098,11 +9224,10 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
 
     ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
 
+    const bool inplace = tensor->view_src != nullptr;
 
+    if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t view_offset = 0;
         if (tensor->op == GGML_OP_VIEW) {

@@ -9182,14 +9307,14 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
 
-    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
         return false;
     }
 
     if (tensor->op == GGML_OP_MUL_MAT) {
         if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
 #endif
             return false;
         }

@@ -9318,6 +9443,10 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         return false;
     }
 
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
+        ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
+    }
+
     if (params->ith != 0) {
         return true;
     }
@@ -9391,7 +9520,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
 
     if (tensor->view_src != NULL && tensor->view_offs == 0) {
+        assert(tensor->view_src->buffer->buft == buffer->buft);
         tensor->backend = tensor->view_src->backend;
         tensor->extra = tensor->view_src->extra;
         return;

@@ -9422,23 +9551,34 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
 }
 
 static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
 }
 
 static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaDeviceSynchronize());
+
     CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
+}
 
+static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
 }
 
 static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {

@@ -9449,6 +9589,7 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
     /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
     /* .cpy_tensor_from = */ NULL,
     /* .cpy_tensor_to   = */ NULL,
+    /* .clear           = */ ggml_backend_cuda_buffer_clear,
 };
 
 // cuda buffer type

@@ -9485,8 +9626,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
 
     if (ggml_is_quantized(tensor->type)) {
         if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
-                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
         }
     }
@@ -9501,35 +9641,36 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t
     UNUSED(buft);
 }
 
+static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
     /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
     /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
+    /* .is_host          = */ nullptr,
 };
 
 ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
+    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
+
+    static bool ggml_backend_cuda_buffer_type_initialized = false;
+
+    if (!ggml_backend_cuda_buffer_type_initialized) {
         for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
+            ggml_backend_cuda_buffer_types[i] = {
+                /* .iface    = */ ggml_backend_cuda_buffer_type_interface,
                 /* .context  = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
             };
         }
+        ggml_backend_cuda_buffer_type_initialized = true;
     }
 
+    return &ggml_backend_cuda_buffer_types[device];
 }
 
 // host buffer type
 
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
-    delete ctx;
+    CUDA_CHECK(cudaFreeHost(buffer->context));
 }
 
 static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {

@@ -9542,24 +9683,21 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
     buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
 
     return buffer;
-
-    UNUSED(buft);
 }
 
-struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
-    /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
-    /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-    /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
-};
-
 ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
+        /* .iface    = */ {
+            /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+        },
         /* .context  = */ nullptr,
     };
 
+    return &ggml_backend_cuda_buffer_type_host;
 }
 
 // backend

@@ -9591,8 +9729,6 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
 
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
     CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));

@@ -9602,8 +9738,6 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
 
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
     CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
The same sync also touches the top of ggml-cuda.cu (HIPBLAS compatibility defines and a new architecture guard):

 #define CUDA_R_16F  HIPBLAS_R_16F
 #define CUDA_R_32F  HIPBLAS_R_32F
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx

 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
+#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess

 #define cudaGetDeviceProperties hipGetDeviceProperties
 #define cudaGetErrorString hipGetErrorString
 #define cudaGetLastError hipGetLastError
+#ifdef GGML_HIP_UMA
+#define cudaMalloc hipMallocManaged
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
+#else
 #define cudaMalloc hipMalloc
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#endif
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpy2DAsync hipMemcpy2DAsync
 #define cudaMemcpyAsync hipMemcpyAsync

 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
+#define __trap abort
 #else
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
+// CUDA 10.2 does not have these macro definitions.
+#ifndef CUBLAS_TF32_TENSOR_OP_MATH
+#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
+#define CUBLAS_COMPUTE_16F CUDA_R_16F
+#define CUBLAS_COMPUTE_32F CUDA_R_32F
+#define cublasComputeType_t cudaDataType_t
+#endif
 #endif // defined(GGML_USE_HIPBLAS)
 
 #include "ggml-cuda.h"

 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
+[[noreturn]]
+static __device__ void bad_arch() {
+    printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
+    __trap();
+
+    (void) bad_arch; // suppress unused function warning
+}
+
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {

A one-line `bad_arch();` call is also added to the unsupported-architecture branch of every quantized dot-product helper, for example (new line 1996):

     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

The equivalent addition appears in the remaining vec_dot_* helpers guarded by MIN_CC_DP4A (new lines 2033, 2068, 2113, 2134, 2164, 2199, 2236, 2276, 2301, 2334, 2367, 2407, 2440, 2470, 2501, 3367 and 3550) and, after the `(void) vec_dot_*_mul_mat;` casts, in the mul_mat kernels guarded by CC_VOLTA (new lines 3960, 4029, 4096, 4163, 4230, 4297 and 4366), matching the q4_K, q5_K and q6_K hunks shown earlier.
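The effect of bad_arch() is to give every specialized kernel a single, loud failure path when the binary was built for a GPU architecture that lacks the required instructions, instead of silently computing nothing. A host-side C++ analogue of the same pattern, purely for illustration (the macro name and the q8 "fast path" are invented for this sketch; the real code traps on the device with __trap()):

    #include <cstdio>
    #include <cstdlib>

    #if defined(TOY_HAVE_FAST_PATH)
    // Normal build: the specialized routine is available.
    static int dot_q8(const signed char * a, const signed char * b, int n) {
        int sum = 0;
        for (int i = 0; i < n; ++i) sum += a[i] * b[i];
        return sum;
    }
    #else
    // Unsupported build: every call funnels into one unmissable failure,
    // the moral equivalent of bad_arch() + __trap() in the CUDA code.
    [[noreturn]] static int dot_q8(const signed char *, const signed char *, int) {
        std::fprintf(stderr, "ERROR: built without support for the fast dot product path\n");
        std::abort();
    }
    #endif

    int main() {
        const signed char a[4] = {1, 2, 3, 4};
        const signed char b[4] = {4, 3, 2, 1};
        std::printf("%d\n", dot_q8(a, b, 4));
        return 0;
    }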
ggml-metal.h CHANGED

@@ -98,7 +98,10 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
 
 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
+GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+
 GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
+
 GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
 
 // helper to check if the device supports a specific family
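The new ggml_backend_metal_buffer_from_ptr entry point wraps an existing host allocation as a Metal backend buffer; judging from the signature, max_size lets the implementation split a large region across several MTLBuffer allocations. A hedged usage sketch; the allocation size, alignment and surrounding plumbing are assumptions, not part of this diff:

    #include <stdlib.h>

    #include "ggml-backend.h"
    #include "ggml-metal.h"

    int main(void) {
        // Illustrative only: wrap a page-aligned host allocation as a Metal buffer.
        // In whisper.cpp this role is typically played by the model data.
        const size_t size = 16u * 1024u * 1024u;      // multiple of the assumed 16 KiB page size
        void * data = NULL;
        if (posix_memalign(&data, 16u * 1024u, size) != 0) {
            return 1;
        }

        // Passing `size` as max_size assumes the whole region fits in one MTLBuffer.
        ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr(data, size, size);

        // ... place tensors into `buf` with ggml-alloc / ggml-backend as usual ...

        ggml_backend_buffer_free(buf);
        free(data); // this sketch assumes the wrapper does not take ownership of `data`
        return 0;
    }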
ggml-metal.m CHANGED

Only the old side of this file's diff is rendered here; removed lines are shown where they are recoverable.

@@ -180,7 +180,15 @@ struct ggml_metal_context {
 @implementation GGMLMetalClass
 @end
 
 void * ggml_metal_log_user_data = NULL;
 
 void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {

@@ -607,12 +615,24 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
 }
 
 // temporarily defined here for compatibility between ggml-backend and the old API
 
     id<MTLBuffer> metal;
 };
 
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer

@@ -622,17 +642,29 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
 
     const int64_t tsize = ggml_nbytes(t);
 
     // compatibility with ggml-backend
-    struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *)
     }
 
     // find the view that contains the tensor fully

@@ -1261,7 +1293,7 @@ void ggml_metal_graph_compute(
                 {
                     GGML_ASSERT(ggml_is_contiguous(src0));
 
-                    const float scale = *(const float *)
 
                     int64_t n = ggml_nelements(dst);

@@ -1272,8 +1304,8 @@ void ggml_metal_graph_compute(
                         [encoder setComputePipelineState:ctx->pipeline_scale];
                     }
 
-                    [encoder setBuffer:id_src0
-                    [encoder setBuffer:id_dst
                     [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
 
                     [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];

@@ -2361,6 +2393,7 @@ void ggml_metal_graph_compute(
 
 // backend interface
 
 static id<MTLDevice> g_backend_device = nil;
 static int g_backend_device_ref_count = 0;

@@ -2388,34 +2421,31 @@ static void ggml_backend_metal_free_device(void) {
 static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
-    return ctx->
 }
 
 static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
     ggml_backend_metal_free_device();
 
 }
 
 static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy((char *)tensor->data + offset, data, size);
 
     UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy(data, (const char *)tensor->data + offset, size);
 
     UNUSED(buffer);

@@ -2433,7 +2463,13 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer
     UNUSED(buffer);
 }
 
     /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
     /* .get_base    = */ ggml_backend_metal_buffer_get_base,
     /* .init_tensor = */ NULL,

@@ -2441,8 +2477,11 @@ static struct ggml_backend_buffer_i metal_backend_buffer_i = {
     /* .get_tensor      = */ ggml_backend_metal_buffer_get_tensor,
     /* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from,
     /* .cpy_tensor_to   = */ ggml_backend_metal_buffer_cpy_tensor_to,
 };
 
 static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));

@@ -2453,13 +2492,46 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
                     length:size_aligned
                     options:MTLResourceStorageModeShared
                     deallocator:nil];
 
 }
 
 static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {

@@ -2470,7 +2542,13 @@ static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_t
 static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
 
 }
 
 ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {

@@ -2480,6 +2558,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
         /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
         /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
         /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
     },
     /* .context = */ NULL,
 };

@@ -2487,6 +2566,87 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
     return &ggml_backend_buffer_type_metal;
 }
 
 static const char * ggml_backend_metal_name(ggml_backend_t backend) {
     return "Metal";

@@ -2499,10 +2659,6 @@ static void ggml_backend_metal_free(ggml_backend_t backend) {
     free(backend);
 }
 
-static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
-    UNUSED(backend);
-}
-
 static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
     return ggml_backend_metal_buffer_type();

@@ -2529,25 +2685,15 @@ static struct ggml_backend_i metal_backend_i = {
     /* .get_tensor_async       = */ NULL,
     /* .cpy_tensor_from_async  = */ NULL,
     /* .cpy_tensor_to_async    = */ NULL,
-    /* .synchronize            = */
-    /* .graph_plan_create      = */ NULL,
     /* .graph_plan_free        = */ NULL,
     /* .graph_plan_compute     = */ NULL,
     /* .graph_compute          = */ ggml_backend_metal_graph_compute,
     /* .supports_op            = */ ggml_backend_metal_supports_op,
 };
| 2540 |
-
// TODO: make a common log callback for all backends in ggml-backend
|
| 2541 |
-
static void ggml_backend_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
|
| 2542 |
-
fprintf(stderr, "%s", msg);
|
| 2543 |
-
|
| 2544 |
-
UNUSED(level);
|
| 2545 |
-
UNUSED(user_data);
|
| 2546 |
-
}
|
| 2547 |
-
|
| 2548 |
ggml_backend_t ggml_backend_metal_init(void) {
|
| 2549 |
-
ggml_metal_log_set_callback(ggml_backend_log_callback, NULL);
|
| 2550 |
-
|
| 2551 |
struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
|
| 2552 |
|
| 2553 |
if (ctx == NULL) {
|
|
|
|
| 180 |
@implementation GGMLMetalClass
|
| 181 |
@end
|
| 182 |
|
| 183 |
+
|
| 184 |
+
static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
|
| 185 |
+
fprintf(stderr, "%s", msg);
|
| 186 |
+
|
| 187 |
+
UNUSED(level);
|
| 188 |
+
UNUSED(user_data);
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
|
| 192 |
void * ggml_metal_log_user_data = NULL;
|
| 193 |
|
| 194 |
void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
|
|
|
|
| 615 |
}
|
| 616 |
|
| 617 |
// temporarily defined here for compatibility between ggml-backend and the old API
|
| 618 |
+
|
| 619 |
+
struct ggml_backend_metal_buffer {
|
| 620 |
+
void * data;
|
| 621 |
+
size_t size;
|
| 622 |
|
| 623 |
id<MTLBuffer> metal;
|
| 624 |
};
|
| 625 |
|
| 626 |
+
struct ggml_backend_metal_buffer_context {
|
| 627 |
+
void * all_data;
|
| 628 |
+
size_t all_size;
|
| 629 |
+
bool owned;
|
| 630 |
+
|
| 631 |
+
// multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
|
| 632 |
+
int n_buffers;
|
| 633 |
+
struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
|
| 634 |
+
};
|
| 635 |
+
|
| 636 |
// finds the Metal buffer that contains the tensor data on the GPU device
|
| 637 |
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
|
| 638 |
// Metal buffer based on the host memory pointer
|
|
|
|
| 642 |
|
| 643 |
const int64_t tsize = ggml_nbytes(t);
|
| 644 |
|
| 645 |
+
ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
|
| 646 |
+
|
| 647 |
// compatibility with ggml-backend
|
| 648 |
+
if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
|
| 649 |
+
struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
|
| 650 |
+
|
| 651 |
+
// find the view that contains the tensor fully
|
| 652 |
+
for (int i = 0; i < buf_ctx->n_buffers; ++i) {
|
| 653 |
+
const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
|
| 654 |
|
| 655 |
+
//GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
|
| 656 |
+
if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
|
| 657 |
+
*offs = (size_t) ioffs;
|
| 658 |
|
| 659 |
+
//GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
|
| 660 |
+
|
| 661 |
+
return buf_ctx->buffers[i].metal;
|
| 662 |
+
}
|
| 663 |
+
}
|
| 664 |
|
| 665 |
+
GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
|
| 666 |
|
| 667 |
+
return nil;
|
| 668 |
}
|
| 669 |
|
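The buffer context introduced above keeps several host ranges, each backed by one MTLBuffer, and the lookup walks them to find the view that fully contains a tensor. A plain-C sketch of that search logic follows; the names are illustrative stand-ins, not ggml API.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct host_view {
    void * data;   // host pointer backing the view
    size_t size;   // view size in bytes
};

// returns the view index, or -1 if the tensor is not fully inside any view
static int find_view(const struct host_view * views, int n_views,
                     const void * tdata, size_t tsize, size_t * offs) {
    for (int i = 0; i < n_views; ++i) {
        const int64_t ioffs = (const char *) tdata - (const char *) views[i].data;
        if (ioffs >= 0 && (size_t) ioffs + tsize <= views[i].size) {
            *offs = (size_t) ioffs;
            return i;
        }
    }
    return -1;
}

int main(void) {
    char pool[4096];
    struct host_view views[2] = { { pool, 2048 }, { pool + 2048, 2048 } };
    size_t offs = 0;
    int idx = find_view(views, 2, pool + 3000, 512, &offs);
    printf("view = %d, offs = %zu\n", idx, offs); // view = 1, offs = 952
    return 0;
}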
| 670 |
// find the view that contains the tensor fully
|
|
|
|
| 1293 |
{
|
| 1294 |
GGML_ASSERT(ggml_is_contiguous(src0));
|
| 1295 |
|
| 1296 |
+
const float scale = *(const float *) dst->op_params;
|
| 1297 |
|
| 1298 |
int64_t n = ggml_nelements(dst);
|
| 1299 |
|
|
|
|
| 1304 |
[encoder setComputePipelineState:ctx->pipeline_scale];
|
| 1305 |
}
|
| 1306 |
|
| 1307 |
+
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
| 1308 |
+
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
| 1309 |
[encoder setBytes:&scale length:sizeof(scale) atIndex:2];
|
| 1310 |
|
| 1311 |
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
|
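The hunk above is the Metal side of the ggml_scale change: the scale factor is no longer read from a second tensor but from the destination tensor's op_params. Below is a minimal stand-alone sketch of that round trip; the struct is a stand-in for struct ggml_tensor and FAKE_MAX_OP_PARAMS is only assumed to mirror GGML_MAX_OP_PARAMS in ggml.h. memcpy is used on the write side, which is what the "fix type punning in ggml_scale" part of this sync is about.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FAKE_MAX_OP_PARAMS 64   // assumption: mirrors GGML_MAX_OP_PARAMS in ggml.h

struct fake_tensor {
    int32_t op_params[FAKE_MAX_OP_PARAMS / sizeof(int32_t)];
};

static void set_scale(struct fake_tensor * dst, float s) {
    memcpy(dst->op_params, &s, sizeof(s));      // what ggml_scale_impl now stores
}

static float get_scale(const struct fake_tensor * dst) {
    float s;
    memcpy(&s, dst->op_params, sizeof(s));      // what the CPU/Metal kernels read back
    return s;
}

int main(void) {
    struct fake_tensor t = {{0}};
    set_scale(&t, 0.125f);
    printf("scale = %f\n", get_scale(&t));      // prints 0.125000
    return 0;
}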
|
|
| 2393 |
|
| 2394 |
// backend interface
|
| 2395 |
|
| 2396 |
+
// default buffer
|
| 2397 |
static id<MTLDevice> g_backend_device = nil;
|
| 2398 |
static int g_backend_device_ref_count = 0;
|
| 2399 |
|
|
|
|
| 2421 |
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
|
| 2422 |
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
| 2423 |
|
| 2424 |
+
return ctx->all_data;
|
| 2425 |
}
|
| 2426 |
|
| 2427 |
static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 2428 |
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
| 2429 |
|
| 2430 |
+
for (int i = 0; i < ctx->n_buffers; i++) {
|
| 2431 |
+
[ctx->buffers[i].metal release];
|
| 2432 |
+
}
|
| 2433 |
ggml_backend_metal_free_device();
|
| 2434 |
|
| 2435 |
+
if (ctx->owned) {
|
| 2436 |
+
free(ctx->all_data);
|
| 2437 |
+
}
|
| 2438 |
|
| 2439 |
+
free(ctx);
|
| 2440 |
}
|
| 2441 |
|
| 2442 |
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
|
|
|
|
|
|
|
|
| 2443 |
memcpy((char *)tensor->data + offset, data, size);
|
| 2444 |
|
| 2445 |
UNUSED(buffer);
|
| 2446 |
}
|
| 2447 |
|
| 2448 |
static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
|
|
|
|
|
|
|
|
| 2449 |
memcpy(data, (const char *)tensor->data + offset, size);
|
| 2450 |
|
| 2451 |
UNUSED(buffer);
|
|
|
|
| 2463 |
UNUSED(buffer);
|
| 2464 |
}
|
| 2465 |
|
| 2466 |
+
static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
| 2467 |
+
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
| 2468 |
+
|
| 2469 |
+
memset(ctx->all_data, value, ctx->all_size);
|
| 2470 |
+
}
|
| 2471 |
+
|
| 2472 |
+
static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
|
| 2473 |
/* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
|
| 2474 |
/* .get_base = */ ggml_backend_metal_buffer_get_base,
|
| 2475 |
/* .init_tensor = */ NULL,
|
|
|
|
| 2477 |
/* .get_tensor = */ ggml_backend_metal_buffer_get_tensor,
|
| 2478 |
/* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from,
|
| 2479 |
/* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to,
|
| 2480 |
+
/* .clear = */ ggml_backend_metal_buffer_clear,
|
| 2481 |
};
|
| 2482 |
|
| 2483 |
+
// default buffer type
|
| 2484 |
+
|
| 2485 |
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 2486 |
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
| 2487 |
|
|
|
|
| 2492 |
size_aligned += (size_page - (size_aligned % size_page));
|
| 2493 |
}
|
| 2494 |
|
| 2495 |
+
id<MTLDevice> device = ggml_backend_metal_get_device();
|
| 2496 |
+
|
| 2497 |
+
ctx->all_data = ggml_metal_host_malloc(size_aligned);
|
| 2498 |
+
ctx->all_size = size_aligned;
|
| 2499 |
+
ctx->owned = true;
|
| 2500 |
+
ctx->n_buffers = 1;
|
| 2501 |
+
|
| 2502 |
+
ctx->buffers[0].data = ctx->all_data;
|
| 2503 |
+
ctx->buffers[0].size = size;
|
| 2504 |
+
ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
|
| 2505 |
length:size_aligned
|
| 2506 |
options:MTLResourceStorageModeShared
|
| 2507 |
deallocator:nil];
|
| 2508 |
|
| 2509 |
+
if (ctx->buffers[0].metal == nil) {
|
| 2510 |
+
GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
|
| 2511 |
+
free(ctx);
|
| 2512 |
+
ggml_backend_metal_free_device();
|
| 2513 |
+
return NULL;
|
| 2514 |
+
}
|
| 2515 |
+
|
| 2516 |
+
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
|
| 2517 |
+
|
| 2518 |
+
|
| 2519 |
+
#if TARGET_OS_OSX
|
| 2520 |
+
GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
|
| 2521 |
+
device.currentAllocatedSize / 1024.0 / 1024.0,
|
| 2522 |
+
device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
| 2523 |
+
|
| 2524 |
+
if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
|
| 2525 |
+
GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
|
| 2526 |
+
} else {
|
| 2527 |
+
GGML_METAL_LOG_INFO("\n");
|
| 2528 |
+
}
|
| 2529 |
+
#else
|
| 2530 |
+
GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
|
| 2531 |
+
#endif
|
| 2532 |
+
|
| 2533 |
+
|
| 2534 |
+
return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
|
| 2535 |
}
|
| 2536 |
|
| 2537 |
static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
|
|
| 2542 |
static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
| 2543 |
return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
|
| 2544 |
|
| 2545 |
+
UNUSED(buft);
|
| 2546 |
+
}
|
| 2547 |
+
|
| 2548 |
+
static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
| 2549 |
+
return true;
|
| 2550 |
+
|
| 2551 |
+
UNUSED(buft);
|
| 2552 |
}
|
| 2553 |
|
| 2554 |
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
|
|
|
| 2558 |
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
|
| 2559 |
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
| 2560 |
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
|
| 2561 |
+
/* .is_host = */ ggml_backend_metal_buffer_type_is_host,
|
| 2562 |
},
|
| 2563 |
/* .context = */ NULL,
|
| 2564 |
};
|
|
|
|
| 2566 |
return &ggml_backend_buffer_type_metal;
|
| 2567 |
}
|
| 2568 |
|
| 2569 |
+
// buffer from ptr
|
| 2570 |
+
|
| 2571 |
+
ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
|
| 2572 |
+
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
| 2573 |
+
|
| 2574 |
+
ctx->all_data = data;
|
| 2575 |
+
ctx->all_size = size;
|
| 2576 |
+
ctx->owned = false;
|
| 2577 |
+
ctx->n_buffers = 0;
|
| 2578 |
+
|
| 2579 |
+
const size_t size_page = sysconf(_SC_PAGESIZE);
|
| 2580 |
+
size_t size_aligned = size;
|
| 2581 |
+
if ((size_aligned % size_page) != 0) {
|
| 2582 |
+
size_aligned += (size_page - (size_aligned % size_page));
|
| 2583 |
+
}
|
| 2584 |
+
|
| 2585 |
+
id<MTLDevice> device = ggml_backend_metal_get_device();
|
| 2586 |
+
|
| 2587 |
+
// the buffer fits into the max buffer size allowed by the device
|
| 2588 |
+
if (size_aligned <= device.maxBufferLength) {
|
| 2589 |
+
ctx->buffers[ctx->n_buffers].data = data;
|
| 2590 |
+
ctx->buffers[ctx->n_buffers].size = size;
|
| 2591 |
+
|
| 2592 |
+
ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
| 2593 |
+
|
| 2594 |
+
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
| 2595 |
+
GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
|
| 2596 |
+
return false;
|
| 2597 |
+
}
|
| 2598 |
+
|
| 2599 |
+
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
|
| 2600 |
+
|
| 2601 |
+
++ctx->n_buffers;
|
| 2602 |
+
} else {
|
| 2603 |
+
// this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
|
| 2604 |
+
// one of the views
|
| 2605 |
+
const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
|
| 2606 |
+
const size_t size_step = device.maxBufferLength - size_ovlp;
|
| 2607 |
+
const size_t size_view = device.maxBufferLength;
|
| 2608 |
+
|
| 2609 |
+
for (size_t i = 0; i < size; i += size_step) {
|
| 2610 |
+
const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
|
| 2611 |
+
|
| 2612 |
+
ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
|
| 2613 |
+
ctx->buffers[ctx->n_buffers].size = size_step_aligned;
|
| 2614 |
+
|
| 2615 |
+
ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
| 2616 |
+
|
| 2617 |
+
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
| 2618 |
+
GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
|
| 2619 |
+
return false;
|
| 2620 |
+
}
|
| 2621 |
+
|
| 2622 |
+
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i);
|
| 2623 |
+
if (i + size_step < size) {
|
| 2624 |
+
GGML_METAL_LOG_INFO("\n");
|
| 2625 |
+
}
|
| 2626 |
+
|
| 2627 |
+
++ctx->n_buffers;
|
| 2628 |
+
}
|
| 2629 |
+
}
|
| 2630 |
+
|
| 2631 |
+
#if TARGET_OS_OSX
|
| 2632 |
+
GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
|
| 2633 |
+
device.currentAllocatedSize / 1024.0 / 1024.0,
|
| 2634 |
+
device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
| 2635 |
+
|
| 2636 |
+
if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
|
| 2637 |
+
GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
|
| 2638 |
+
} else {
|
| 2639 |
+
GGML_METAL_LOG_INFO("\n");
|
| 2640 |
+
}
|
| 2641 |
+
#else
|
| 2642 |
+
GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
|
| 2643 |
+
#endif
|
| 2644 |
+
|
| 2645 |
+
return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
|
| 2646 |
+
}
|
| 2647 |
+
|
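ggml_backend_metal_buffer_from_ptr above splits a region larger than the device's maximum buffer length into overlapping views, sized so that the largest tensor always fits entirely inside one view. A small sketch of that arithmetic, with made-up sizes standing in for device.maxBufferLength and the mapped region:

#include <stdio.h>
#include <stddef.h>

int main(void) {
    const size_t size_page   = 4096;
    const size_t max_buf_len = 256u * 1024;        // pretend device.maxBufferLength
    const size_t size        = 1000u * 1024;       // total mapped region
    const size_t max_tensor  = 48u * 1024;         // largest tensor in the region

    // round the overlap up to whole pages, plus one extra page of slack
    const size_t size_ovlp = ((max_tensor + size_page - 1) / size_page + 1) * size_page;
    const size_t size_step = max_buf_len - size_ovlp;
    const size_t size_view = max_buf_len;

    for (size_t i = 0; i < size; i += size_step) {
        const size_t view = (i + size_view <= size) ? size_view : (size - i);
        printf("view at offset %8zu, length %8zu\n", i, view);
    }
    return 0;
}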
| 2648 |
+
// backend
|
| 2649 |
+
|
| 2650 |
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
|
| 2651 |
return "Metal";
|
| 2652 |
|
|
|
|
| 2659 |
free(backend);
|
| 2660 |
}
|
| 2661 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2662 |
static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
|
| 2663 |
return ggml_backend_metal_buffer_type();
|
| 2664 |
|
|
|
|
| 2685 |
/* .get_tensor_async = */ NULL,
|
| 2686 |
/* .cpy_tensor_from_async = */ NULL,
|
| 2687 |
/* .cpy_tensor_to_async = */ NULL,
|
| 2688 |
+
/* .synchronize = */ NULL,
|
| 2689 |
+
/* .graph_plan_create = */ NULL,
|
| 2690 |
/* .graph_plan_free = */ NULL,
|
| 2691 |
/* .graph_plan_compute = */ NULL,
|
| 2692 |
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
| 2693 |
/* .supports_op = */ ggml_backend_metal_supports_op,
|
| 2694 |
};
|
| 2695 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2696 |
ggml_backend_t ggml_backend_metal_init(void) {
|
|
|
|
|
|
|
| 2697 |
struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
|
| 2698 |
|
| 2699 |
if (ctx == NULL) {
|
ggml-metal.metal
CHANGED

@@ -1702,8 +1702,9 @@ kernel void kernel_rope(
             dst_data[1] = x0*sin_theta + x1*cos_theta;
         }
     } else {
-        for (int64_t
-
+        for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) {
+            if (ic < n_dims) {
+                const int64_t ib = 0;

                 // simplified from `(ib * n_dims + ic) * inv_ndims`
                 const float cur_rot = inv_ndims*ic - ib;

@@ -1722,6 +1723,14 @@ kernel void kernel_rope(

                 dst_data[0]        = x0*cos_theta - x1*sin_theta;
                 dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+            } else {
+                const int64_t i0 = ic;
+
+                device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                dst_data[0] = src[0];
+                dst_data[1] = src[1];
             }
         }
     }
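The kernel change above makes kernel_rope handle ne0 > n_dims: the first n_dims components of a row are rotated in pairs and the remaining components are copied through unchanged. Below is a simplified plain-C model of that per-row behaviour (single thread, fixed angle, no frequency scaling, so only the control flow matches the kernel):

#include <math.h>
#include <stdio.h>

static void rope_row(const float * src, float * dst, int ne0, int n_dims, float theta) {
    const float cos_t = cosf(theta);
    const float sin_t = sinf(theta);
    for (int ic = 0; ic < ne0; ic += 2) {
        if (ic < n_dims) {
            const float x0 = src[ic/2];
            const float x1 = src[ic/2 + n_dims/2];
            dst[ic/2]            = x0*cos_t - x1*sin_t;   // rotated pair
            dst[ic/2 + n_dims/2] = x0*sin_t + x1*cos_t;
        } else {
            dst[ic]     = src[ic];                        // pass-through tail
            dst[ic + 1] = src[ic + 1];
        }
    }
}

int main(void) {
    float src[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    float dst[8] = {0};
    rope_row(src, dst, 8, 4, 0.5f);
    for (int i = 0; i < 8; ++i) printf("%.3f ", dst[i]);
    printf("\n");
    return 0;
}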
ggml-quants.c
CHANGED

@@ -3677,7 +3677,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri

     const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4);
     const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
-    const ggml_int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))};
+    const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}};
     const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])),
                                    vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0])));
     const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])),

@@ -6626,7 +6626,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri

     const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
     const int8x16_t scales = vld1q_s8(scale);
-    const ggml_int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))};
+    const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}};

     const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
                                                vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
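The two ggml-quants.c fixes above only add a brace level: ggml_int16x8x2_t-style helpers wrap an array member, so initializing them portably (and without -Wmissing-braces noise on some ARM toolchains) needs one brace for the struct and one for the array. A tiny stand-in example, no NEON required:

#include <stdio.h>

typedef struct {
    long long val[2];            // stand-in for the vector-pair member
} fake_int16x8x2_t;

int main(void) {
    // outer braces initialize the struct, inner braces initialize val[]
    const fake_int16x8x2_t x = {{ 1, 2 }};
    printf("%lld %lld\n", x.val[0], x.val[1]);   // 1 2
    return 0;
}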
ggml.c
CHANGED
|
@@ -1997,12 +1997,6 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
|
|
| 1997 |
return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
|
| 1998 |
}
|
| 1999 |
|
| 2000 |
-
size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
|
| 2001 |
-
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 2002 |
-
|
| 2003 |
-
return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
|
| 2004 |
-
}
|
| 2005 |
-
|
| 2006 |
int ggml_blck_size(enum ggml_type type) {
|
| 2007 |
return type_traits[type].blck_size;
|
| 2008 |
}
|
|
@@ -2011,8 +2005,13 @@ size_t ggml_type_size(enum ggml_type type) {
|
|
| 2011 |
return type_traits[type].type_size;
|
| 2012 |
}
|
| 2013 |
|
| 2014 |
-
|
| 2015 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2016 |
}
|
| 2017 |
|
| 2018 |
const char * ggml_type_name(enum ggml_type type) {
|
|
@@ -2049,24 +2048,37 @@ size_t ggml_element_size(const struct ggml_tensor * tensor) {
|
|
| 2049 |
return ggml_type_size(tensor->type);
|
| 2050 |
}
|
| 2051 |
|
| 2052 |
-
|
| 2053 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 2054 |
|
| 2055 |
return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
| 2056 |
}
|
| 2057 |
|
| 2058 |
-
|
| 2059 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 2060 |
|
| 2061 |
return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
| 2062 |
}
|
| 2063 |
|
| 2064 |
-
|
| 2065 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 2066 |
|
| 2067 |
return tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
| 2068 |
}
|
| 2069 |
|
| 2070 |
static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
| 2071 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 2072 |
|
|
@@ -2371,20 +2383,8 @@ size_t ggml_get_mem_size(const struct ggml_context * ctx) {
|
|
| 2371 |
size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
|
| 2372 |
size_t max_size = 0;
|
| 2373 |
|
| 2374 |
-
struct
|
| 2375 |
-
|
| 2376 |
-
while (obj != NULL) {
|
| 2377 |
-
if (obj->type == GGML_OBJECT_TENSOR) {
|
| 2378 |
-
struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
|
| 2379 |
-
|
| 2380 |
-
const size_t size = ggml_nbytes(tensor);
|
| 2381 |
-
|
| 2382 |
-
if (max_size < size) {
|
| 2383 |
-
max_size = size;
|
| 2384 |
-
}
|
| 2385 |
-
}
|
| 2386 |
-
|
| 2387 |
-
obj = obj->next;
|
| 2388 |
}
|
| 2389 |
|
| 2390 |
return max_size;
|
|
@@ -2473,7 +2473,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
|
| 2473 |
view_src = view_src->view_src;
|
| 2474 |
}
|
| 2475 |
|
| 2476 |
-
size_t data_size =
|
| 2477 |
for (int i = 1; i < n_dims; i++) {
|
| 2478 |
data_size *= ne[i];
|
| 2479 |
}
|
|
@@ -2516,7 +2516,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
|
| 2516 |
/*.type =*/ type,
|
| 2517 |
/*.backend =*/ GGML_BACKEND_CPU,
|
| 2518 |
/*.buffer =*/ NULL,
|
| 2519 |
-
/*.n_dims =*/ n_dims,
|
| 2520 |
/*.ne =*/ { 1, 1, 1, 1 },
|
| 2521 |
/*.nb =*/ { 0, 0, 0, 0 },
|
| 2522 |
/*.op =*/ GGML_OP_NONE,
|
|
@@ -2623,7 +2622,7 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
|
|
| 2623 |
}
|
| 2624 |
|
| 2625 |
struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
|
| 2626 |
-
return ggml_new_tensor(ctx, src->type,
|
| 2627 |
}
|
| 2628 |
|
| 2629 |
static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
|
|
@@ -3072,7 +3071,7 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
|
|
| 3072 |
struct ggml_tensor * ggml_view_tensor(
|
| 3073 |
struct ggml_context * ctx,
|
| 3074 |
struct ggml_tensor * src) {
|
| 3075 |
-
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type,
|
| 3076 |
ggml_format_name(result, "%s (view)", src->name);
|
| 3077 |
|
| 3078 |
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
|
@@ -3082,7 +3081,7 @@ struct ggml_tensor * ggml_view_tensor(
|
|
| 3082 |
return result;
|
| 3083 |
}
|
| 3084 |
|
| 3085 |
-
struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
|
| 3086 |
struct ggml_object * obj = ctx->objects_begin;
|
| 3087 |
|
| 3088 |
char * const mem_buffer = ctx->mem_buffer;
|
|
@@ -3098,7 +3097,7 @@ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
|
|
| 3098 |
return NULL;
|
| 3099 |
}
|
| 3100 |
|
| 3101 |
-
struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
|
| 3102 |
struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
|
| 3103 |
obj = obj->next;
|
| 3104 |
|
|
@@ -3230,10 +3229,10 @@ static struct ggml_tensor * ggml_add_cast_impl(
|
|
| 3230 |
is_node = true;
|
| 3231 |
}
|
| 3232 |
|
| 3233 |
-
struct ggml_tensor * result = ggml_new_tensor(ctx, type,
|
| 3234 |
|
| 3235 |
result->op = GGML_OP_ADD;
|
| 3236 |
-
result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32,
|
| 3237 |
result->src[0] = a;
|
| 3238 |
result->src[1] = b;
|
| 3239 |
|
|
@@ -3602,12 +3601,12 @@ struct ggml_tensor * ggml_sum_rows(
|
|
| 3602 |
is_node = true;
|
| 3603 |
}
|
| 3604 |
|
| 3605 |
-
int64_t ne[
|
| 3606 |
-
for (int i=1; i<
|
| 3607 |
ne[i] = a->ne[i];
|
| 3608 |
}
|
| 3609 |
|
| 3610 |
-
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type,
|
| 3611 |
|
| 3612 |
result->op = GGML_OP_SUM_ROWS;
|
| 3613 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
|
@@ -3628,8 +3627,8 @@ struct ggml_tensor * ggml_mean(
|
|
| 3628 |
is_node = true;
|
| 3629 |
}
|
| 3630 |
|
| 3631 |
-
int64_t ne[
|
| 3632 |
-
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
|
| 3633 |
|
| 3634 |
result->op = GGML_OP_MEAN;
|
| 3635 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
|
@@ -3651,8 +3650,7 @@ struct ggml_tensor * ggml_argmax(
|
|
| 3651 |
is_node = true;
|
| 3652 |
}
|
| 3653 |
|
| 3654 |
-
|
| 3655 |
-
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne);
|
| 3656 |
|
| 3657 |
result->op = GGML_OP_ARGMAX;
|
| 3658 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
|
@@ -3675,7 +3673,7 @@ struct ggml_tensor * ggml_repeat(
|
|
| 3675 |
is_node = true;
|
| 3676 |
}
|
| 3677 |
|
| 3678 |
-
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type,
|
| 3679 |
|
| 3680 |
result->op = GGML_OP_REPEAT;
|
| 3681 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
|
@@ -3702,7 +3700,7 @@ struct ggml_tensor * ggml_repeat_back(
|
|
| 3702 |
return a;
|
| 3703 |
}
|
| 3704 |
|
| 3705 |
-
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type,
|
| 3706 |
|
| 3707 |
result->op = GGML_OP_REPEAT_BACK;
|
| 3708 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
|
@@ -4078,7 +4076,7 @@ struct ggml_tensor * ggml_mul_mat(
|
|
| 4078 |
}
|
| 4079 |
|
| 4080 |
const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
|
| 4081 |
-
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
|
| 4082 |
|
| 4083 |
result->op = GGML_OP_MUL_MAT;
|
| 4084 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
|
@@ -4088,6 +4086,14 @@ struct ggml_tensor * ggml_mul_mat(
|
|
| 4088 |
return result;
|
| 4089 |
}
|
| 4090 |
|
| 4091 |
// ggml_mul_mat_id
|
| 4092 |
|
| 4093 |
struct ggml_tensor * ggml_mul_mat_id(
|
|
@@ -4112,7 +4118,7 @@ struct ggml_tensor * ggml_mul_mat_id(
|
|
| 4112 |
}
|
| 4113 |
|
| 4114 |
const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
|
| 4115 |
-
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
|
| 4116 |
|
| 4117 |
ggml_set_op_params_i32(result, 0, id);
|
| 4118 |
ggml_set_op_params_i32(result, 1, n_as);
|
|
@@ -4150,7 +4156,7 @@ struct ggml_tensor * ggml_out_prod(
|
|
| 4150 |
|
| 4151 |
// a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
|
| 4152 |
const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
|
| 4153 |
-
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
|
| 4154 |
|
| 4155 |
result->op = GGML_OP_OUT_PROD;
|
| 4156 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
|
@@ -4165,23 +4171,23 @@ struct ggml_tensor * ggml_out_prod(
|
|
| 4165 |
static struct ggml_tensor * ggml_scale_impl(
|
| 4166 |
struct ggml_context * ctx,
|
| 4167 |
struct ggml_tensor * a,
|
| 4168 |
-
|
| 4169 |
bool inplace) {
|
| 4170 |
-
GGML_ASSERT(ggml_is_scalar(b));
|
| 4171 |
GGML_ASSERT(ggml_is_padded_1d(a));
|
| 4172 |
|
| 4173 |
bool is_node = false;
|
| 4174 |
|
| 4175 |
-
if (a->grad
|
| 4176 |
is_node = true;
|
| 4177 |
}
|
| 4178 |
|
| 4179 |
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
| 4180 |
|
|
|
|
|
|
|
| 4181 |
result->op = GGML_OP_SCALE;
|
| 4182 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
| 4183 |
result->src[0] = a;
|
| 4184 |
-
result->src[1] = b;
|
| 4185 |
|
| 4186 |
return result;
|
| 4187 |
}
|
|
@@ -4189,15 +4195,15 @@ static struct ggml_tensor * ggml_scale_impl(
|
|
| 4189 |
struct ggml_tensor * ggml_scale(
|
| 4190 |
struct ggml_context * ctx,
|
| 4191 |
struct ggml_tensor * a,
|
| 4192 |
-
|
| 4193 |
-
return ggml_scale_impl(ctx, a,
|
| 4194 |
}
|
| 4195 |
|
| 4196 |
struct ggml_tensor * ggml_scale_inplace(
|
| 4197 |
struct ggml_context * ctx,
|
| 4198 |
struct ggml_tensor * a,
|
| 4199 |
-
|
| 4200 |
-
return ggml_scale_impl(ctx, a,
|
| 4201 |
}
|
| 4202 |
|
| 4203 |
// ggml_set
|
|
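The two hunks above drop the tensor operand b from ggml_scale_impl and pass the factor directly. A caller-side sketch of what graph code looks like after this change, assuming ggml_scale now takes a plain float (the full new signature lives in ggml.h, which is not shown here):

#include "ggml.h"

// before this sync the factor was a 1-element tensor:
//     x = ggml_scale(ctx, x, ggml_new_f32(ctx, 0.5f));
// after it, the float is passed directly and ends up in the node's op_params:
static struct ggml_tensor * halve(struct ggml_context * ctx, struct ggml_tensor * a) {
    return ggml_scale(ctx, a, 0.5f);
}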
@@ -4435,7 +4441,7 @@ struct ggml_tensor * ggml_reshape(
|
|
| 4435 |
//GGML_ASSERT(false);
|
| 4436 |
}
|
| 4437 |
|
| 4438 |
-
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type,
|
| 4439 |
ggml_format_name(result, "%s (reshaped)", a->name);
|
| 4440 |
|
| 4441 |
result->op = GGML_OP_RESHAPE;
|
|
@@ -4813,7 +4819,7 @@ struct ggml_tensor * ggml_diag(
|
|
| 4813 |
}
|
| 4814 |
|
| 4815 |
const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
|
| 4816 |
-
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type,
|
| 4817 |
|
| 4818 |
result->op = GGML_OP_DIAG;
|
| 4819 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
|
@@ -5460,7 +5466,7 @@ struct ggml_tensor * ggml_pool_1d(
|
|
| 5460 |
is_node = true;
|
| 5461 |
}
|
| 5462 |
|
| 5463 |
-
const int64_t ne[
|
| 5464 |
ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
|
| 5465 |
a->ne[1],
|
| 5466 |
};
|
|
@@ -5579,7 +5585,7 @@ struct ggml_tensor * ggml_argsort(
|
|
| 5579 |
enum ggml_sort_order order) {
|
| 5580 |
bool is_node = false;
|
| 5581 |
|
| 5582 |
-
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32,
|
| 5583 |
|
| 5584 |
ggml_set_op_params_i32(result, 0, (int32_t) order);
|
| 5585 |
|
|
@@ -5626,7 +5632,7 @@ struct ggml_tensor * ggml_flash_attn(
|
|
| 5626 |
}
|
| 5627 |
|
| 5628 |
//struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
|
| 5629 |
-
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
|
| 5630 |
|
| 5631 |
int32_t t = masked ? 1 : 0;
|
| 5632 |
ggml_set_op_params(result, &t, sizeof(t));
|
|
@@ -5659,7 +5665,7 @@ struct ggml_tensor * ggml_flash_ff(
|
|
| 5659 |
}
|
| 5660 |
|
| 5661 |
//struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
| 5662 |
-
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
|
| 5663 |
|
| 5664 |
result->op = GGML_OP_FLASH_FF;
|
| 5665 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
|
@@ -5775,7 +5781,6 @@ struct ggml_tensor * ggml_win_part(
|
|
| 5775 |
const int np = npx*npy;
|
| 5776 |
|
| 5777 |
const int64_t ne[4] = { a->ne[0], w, w, np, };
|
| 5778 |
-
|
| 5779 |
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
| 5780 |
|
| 5781 |
int32_t params[] = { npx, npy, w };
|
|
@@ -7759,10 +7764,10 @@ static void ggml_compute_forward_mul_f32(
|
|
| 7759 |
const int ith = params->ith;
|
| 7760 |
const int nth = params->nth;
|
| 7761 |
|
| 7762 |
-
// TODO: OpenCL kernel support broadcast
|
| 7763 |
#ifdef GGML_USE_CLBLAST
|
| 7764 |
if (src1->backend == GGML_BACKEND_GPU) {
|
| 7765 |
-
|
|
|
|
| 7766 |
if (ith == 0) {
|
| 7767 |
ggml_cl_mul(src0, src1, dst);
|
| 7768 |
}
|
|
@@ -9159,6 +9164,8 @@ static void ggml_compute_forward_norm_f32(
|
|
| 9159 |
float eps;
|
| 9160 |
memcpy(&eps, dst->op_params, sizeof(float));
|
| 9161 |
|
|
|
|
|
|
|
| 9162 |
// TODO: optimize
|
| 9163 |
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
| 9164 |
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
|
@@ -9228,6 +9235,8 @@ static void ggml_compute_forward_rms_norm_f32(
|
|
| 9228 |
float eps;
|
| 9229 |
memcpy(&eps, dst->op_params, sizeof(float));
|
| 9230 |
|
|
|
|
|
|
|
| 9231 |
// TODO: optimize
|
| 9232 |
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
| 9233 |
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
|
@@ -9571,16 +9580,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
|
|
| 9571 |
}
|
| 9572 |
#endif
|
| 9573 |
|
| 9574 |
-
// off1 = offset in i11 and i1
|
| 9575 |
-
// cne1 = ne11 and ne1
|
| 9576 |
-
// in a normal matrix multiplication, off1 = 0 and cne1 = ne1
|
| 9577 |
-
// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
|
| 9578 |
static void ggml_compute_forward_mul_mat(
|
| 9579 |
const struct ggml_compute_params * params,
|
| 9580 |
const struct ggml_tensor * src0,
|
| 9581 |
const struct ggml_tensor * src1,
|
| 9582 |
-
struct ggml_tensor * dst
|
| 9583 |
-
int64_t off1, int64_t cne1) {
|
| 9584 |
int64_t t0 = ggml_perf_time_us();
|
| 9585 |
UNUSED(t0);
|
| 9586 |
|
|
@@ -9648,9 +9652,9 @@ static void ggml_compute_forward_mul_mat(
|
|
| 9648 |
const int64_t i03 = i13/r3;
|
| 9649 |
const int64_t i02 = i12/r2;
|
| 9650 |
|
| 9651 |
-
const void * x = (char *) src0->data +
|
| 9652 |
-
const float * y = (float *) ((char *) src1->data +
|
| 9653 |
-
float * d = (float *) ((char *) dst->data +
|
| 9654 |
|
| 9655 |
if (type != GGML_TYPE_F32) {
|
| 9656 |
float * const wdata = params->wdata;
|
|
@@ -9667,7 +9671,7 @@ static void ggml_compute_forward_mul_mat(
|
|
| 9667 |
}
|
| 9668 |
|
| 9669 |
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
| 9670 |
-
|
| 9671 |
1.0f, y, ne10,
|
| 9672 |
x, ne00,
|
| 9673 |
0.0f, d, ne01);
|
|
@@ -9683,7 +9687,7 @@ static void ggml_compute_forward_mul_mat(
|
|
| 9683 |
if (params->type == GGML_TASK_INIT) {
|
| 9684 |
if (src1->type != vec_dot_type) {
|
| 9685 |
char * wdata = params->wdata;
|
| 9686 |
-
const size_t row_size =
|
| 9687 |
|
| 9688 |
assert(params->wsize >= ne11*ne12*ne13*row_size);
|
| 9689 |
assert(src1->type == GGML_TYPE_F32);
|
|
@@ -9706,10 +9710,10 @@ static void ggml_compute_forward_mul_mat(
|
|
| 9706 |
}
|
| 9707 |
|
| 9708 |
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
| 9709 |
-
const size_t row_size =
|
| 9710 |
|
| 9711 |
-
const int64_t nr0 = ne01;
|
| 9712 |
-
const int64_t nr1 =
|
| 9713 |
|
| 9714 |
//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
|
| 9715 |
|
|
@@ -9751,9 +9755,9 @@ static void ggml_compute_forward_mul_mat(
|
|
| 9751 |
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
|
| 9752 |
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
|
| 9753 |
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
|
| 9754 |
-
const int64_t i13 = (ir1/(ne12*
|
| 9755 |
-
const int64_t i12 = (ir1 - i13*ne12*
|
| 9756 |
-
const int64_t i11 = (ir1 - i13*ne12*
|
| 9757 |
|
| 9758 |
// broadcast src0 into src1
|
| 9759 |
const int64_t i03 = i13/r3;
|
|
@@ -9793,28 +9797,191 @@ static void ggml_compute_forward_mul_mat(

 static void ggml_compute_forward_mul_mat_id(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor *
         const struct ggml_tensor * src1,
               struct ggml_tensor * dst) {

-
-
-
-
-

-
     const int id   = ggml_get_op_params_i32(dst, 0);
     const int n_as = ggml_get_op_params_i32(dst, 1);

-
-

-

-
-    ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
     }
 }

 // ggml_compute_forward_out_prod

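The old ggml_compute_forward_mul_mat_id above simply forwarded one row at a time to ggml_compute_forward_mul_mat; the replacement (its added lines are not rendered in this view) performs the routing itself. As a reminder of what the op computes, here is a scalar toy model of id-based routing only; it is not the ggml implementation:

#include <stdio.h>

enum { N_AS = 2, NE0 = 2, NE1 = 3 };   // experts, rows, columns

int main(void) {
    // two 2x2 expert matrices (row-major), one id per column of src1
    const float as[N_AS][NE0][NE0] = { {{1,0},{0,1}}, {{2,0},{0,2}} };
    const int   ids[NE1]           = { 0, 1, 0 };
    const float src1[NE1][NE0]     = { {1,1}, {1,1}, {2,3} };
    float dst[NE1][NE0] = {{0}};

    for (int j = 0; j < NE1; ++j) {              // each column picks its expert
        const float (*a)[NE0] = as[ids[j]];
        for (int r = 0; r < NE0; ++r)
            for (int k = 0; k < NE0; ++k)
                dst[j][r] += a[r][k] * src1[j][k];
    }
    for (int j = 0; j < NE1; ++j)
        printf("dst[%d] = { %g, %g }\n", j, dst[j][0], dst[j][1]);
    return 0;
}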
@@ -10158,19 +10325,18 @@ static void ggml_compute_forward_out_prod(
|
|
| 10158 |
static void ggml_compute_forward_scale_f32(
|
| 10159 |
const struct ggml_compute_params * params,
|
| 10160 |
const struct ggml_tensor * src0,
|
| 10161 |
-
const struct ggml_tensor * src1,
|
| 10162 |
struct ggml_tensor * dst) {
|
| 10163 |
GGML_ASSERT(ggml_is_contiguous(src0));
|
| 10164 |
GGML_ASSERT(ggml_is_contiguous(dst));
|
| 10165 |
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
| 10166 |
-
GGML_ASSERT(ggml_is_scalar(src1));
|
| 10167 |
|
| 10168 |
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
| 10169 |
return;
|
| 10170 |
}
|
| 10171 |
|
| 10172 |
// scale factor
|
| 10173 |
-
|
|
|
|
| 10174 |
|
| 10175 |
const int ith = params->ith;
|
| 10176 |
const int nth = params->nth;
|
|
@@ -10201,12 +10367,11 @@ static void ggml_compute_forward_scale_f32(
|
|
| 10201 |
static void ggml_compute_forward_scale(
|
| 10202 |
const struct ggml_compute_params * params,
|
| 10203 |
const struct ggml_tensor * src0,
|
| 10204 |
-
const struct ggml_tensor * src1,
|
| 10205 |
struct ggml_tensor * dst) {
|
| 10206 |
switch (src0->type) {
|
| 10207 |
case GGML_TYPE_F32:
|
| 10208 |
{
|
| 10209 |
-
ggml_compute_forward_scale_f32(params, src0,
|
| 10210 |
} break;
|
| 10211 |
default:
|
| 10212 |
{
|
|
@@ -11395,10 +11560,13 @@ static void ggml_compute_forward_rope_f32(
|
|
| 11395 |
}
|
| 11396 |
} else {
|
| 11397 |
// TODO: this might be wrong for ne0 != n_dims - need double check
|
| 11398 |
-
//
|
|
|
|
| 11399 |
theta_base *= freq_scale;
|
| 11400 |
-
for (int64_t
|
| 11401 |
-
|
|
|
|
|
|
|
| 11402 |
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
| 11403 |
float cur_rot = inv_ndims * ic - ib;
|
| 11404 |
|
|
@@ -11421,6 +11589,14 @@ static void ggml_compute_forward_rope_f32(
|
|
| 11421 |
|
| 11422 |
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
| 11423 |
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11424 |
}
|
| 11425 |
}
|
| 11426 |
}
|
|
@@ -11548,10 +11724,13 @@ static void ggml_compute_forward_rope_f16(
|
|
| 11548 |
}
|
| 11549 |
} else {
|
| 11550 |
// TODO: this might be wrong for ne0 != n_dims - need double check
|
| 11551 |
-
//
|
|
|
|
| 11552 |
theta_base *= freq_scale;
|
| 11553 |
-
for (int64_t
|
| 11554 |
-
|
|
|
|
|
|
|
| 11555 |
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
| 11556 |
float cur_rot = inv_ndims * ic - ib;
|
| 11557 |
|
|
@@ -11574,6 +11753,14 @@ static void ggml_compute_forward_rope_f16(
|
|
| 11574 |
|
| 11575 |
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
| 11576 |
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11577 |
}
|
| 11578 |
}
|
| 11579 |
}
|
|
@@ -14182,7 +14369,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
| 14182 |
} break;
|
| 14183 |
case GGML_OP_MUL_MAT:
|
| 14184 |
{
|
| 14185 |
-
ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor
|
| 14186 |
} break;
|
| 14187 |
case GGML_OP_MUL_MAT_ID:
|
| 14188 |
{
|
|
@@ -14194,7 +14381,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
| 14194 |
} break;
|
| 14195 |
case GGML_OP_SCALE:
|
| 14196 |
{
|
| 14197 |
-
ggml_compute_forward_scale(params, tensor->src[0], tensor
|
| 14198 |
} break;
|
| 14199 |
case GGML_OP_SET:
|
| 14200 |
{
|
|
@@ -14558,7 +14745,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
|
|
| 14558 |
return replacements->vals[i];
|
| 14559 |
}
|
| 14560 |
|
| 14561 |
-
struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type,
|
| 14562 |
|
| 14563 |
// insert clone into replacements
|
| 14564 |
GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
|
|
@@ -14650,7 +14837,7 @@ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct gg
|
|
| 14650 |
|
| 14651 |
static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
|
| 14652 |
if (ggml_hash_contains(zero_table, a)) {
|
| 14653 |
-
struct ggml_tensor * a_zero = ggml_scale(ctx, a,
|
| 14654 |
return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
|
| 14655 |
} else {
|
| 14656 |
return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
|
|
@@ -14786,7 +14973,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
| 14786 |
src0->grad,
|
| 14787 |
ggml_scale(ctx,
|
| 14788 |
ggml_mul(ctx, src0, tensor->grad),
|
| 14789 |
-
|
| 14790 |
zero_table);
|
| 14791 |
}
|
| 14792 |
} break;
|
|
@@ -14800,7 +14987,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
| 14800 |
ggml_div(ctx,
|
| 14801 |
tensor->grad,
|
| 14802 |
tensor),
|
| 14803 |
-
|
| 14804 |
zero_table);
|
| 14805 |
}
|
| 14806 |
} break;
|
|
@@ -14966,17 +15153,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
| 14966 |
{
|
| 14967 |
// necessary for llama
|
| 14968 |
if (src0->grad) {
|
|
|
|
|
|
|
|
|
|
| 14969 |
src0->grad =
|
| 14970 |
ggml_add_or_set(ctx,
|
| 14971 |
src0->grad,
|
| 14972 |
-
ggml_scale_impl(ctx, tensor->grad,
|
| 14973 |
-
zero_table);
|
| 14974 |
-
}
|
| 14975 |
-
if (src1->grad) {
|
| 14976 |
-
src1->grad =
|
| 14977 |
-
ggml_add_or_set(ctx,
|
| 14978 |
-
src1->grad,
|
| 14979 |
-
ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)),
|
| 14980 |
zero_table);
|
| 14981 |
}
|
| 14982 |
} break;
|
|
@@ -15154,6 +15337,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
| 15154 |
const int n_past = ((int32_t *) tensor->op_params)[0];
|
| 15155 |
src0->grad =
|
| 15156 |
ggml_add_or_set(ctx, src0->grad,
|
|
|
|
|
|
|
| 15157 |
ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
|
| 15158 |
zero_table);
|
| 15159 |
}
|
|
@@ -15982,7 +16167,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
| 15982 |
} break;
|
| 15983 |
case GGML_OP_MUL_MAT_ID:
|
| 15984 |
{
|
| 15985 |
-
// FIXME: blas
|
| 15986 |
n_tasks = n_threads;
|
| 15987 |
} break;
|
| 15988 |
case GGML_OP_OUT_PROD:
|
|
@@ -16311,25 +16495,21 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
| 16311 |
} else
|
| 16312 |
#endif
|
| 16313 |
if (node->src[1]->type != vec_dot_type) {
|
| 16314 |
-
cur =
|
| 16315 |
}
|
| 16316 |
} break;
|
| 16317 |
case GGML_OP_MUL_MAT_ID:
|
| 16318 |
{
|
| 16319 |
-
const struct ggml_tensor *
|
| 16320 |
-
const struct ggml_tensor *
|
| 16321 |
-
const enum ggml_type vec_dot_type = type_traits[
|
| 16322 |
-
|
| 16323 |
-
|
| 16324 |
-
if (a->type != GGML_TYPE_F32) {
|
| 16325 |
-
// here we need memory just for single 2D matrix from src0
|
| 16326 |
-
cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
|
| 16327 |
-
}
|
| 16328 |
-
} else
|
| 16329 |
-
#endif
|
| 16330 |
-
if (b->type != vec_dot_type) {
|
| 16331 |
-
cur = ggml_type_size(vec_dot_type)*ggml_nelements(b)/ggml_blck_size(vec_dot_type);
|
| 16332 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16333 |
} break;
|
| 16334 |
case GGML_OP_OUT_PROD:
|
| 16335 |
{
|
|
@@ -16559,7 +16739,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
|
|
| 16559 |
fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
|
| 16560 |
ggml_type_name(tensor->type),
|
| 16561 |
ggml_op_name (tensor->op),
|
| 16562 |
-
tensor
|
| 16563 |
ne[0], ne[1], ne[2], ne[3],
|
| 16564 |
nb[0], nb[1], nb[2], nb[3],
|
| 16565 |
tensor->data,
|
|
@@ -16574,7 +16754,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
|
|
| 16574 |
arg,
|
| 16575 |
ggml_type_name(tensor->type),
|
| 16576 |
ggml_op_name (tensor->op),
|
| 16577 |
-
tensor
|
| 16578 |
ne[0], ne[1], ne[2], ne[3],
|
| 16579 |
nb[0], nb[1], nb[2], nb[3],
|
| 16580 |
tensor->data,
|
|
@@ -16664,11 +16844,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
|
| 16664 |
|
| 16665 |
const uint32_t type = tensor->type;
|
| 16666 |
const uint32_t op = tensor->op;
|
| 16667 |
-
const uint32_t n_dims = tensor->n_dims;
|
| 16668 |
|
| 16669 |
fwrite(&type, sizeof(uint32_t), 1, fout);
|
| 16670 |
fwrite(&op, sizeof(uint32_t), 1, fout);
|
| 16671 |
-
fwrite(&n_dims, sizeof(uint32_t), 1, fout);
|
| 16672 |
|
| 16673 |
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
|
| 16674 |
const uint64_t ne = tensor->ne[j];
|
|
@@ -16698,11 +16876,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
|
| 16698 |
|
| 16699 |
const uint32_t type = tensor->type;
|
| 16700 |
const uint32_t op = tensor->op;
|
| 16701 |
-
const uint32_t n_dims = tensor->n_dims;
|
| 16702 |
|
| 16703 |
fwrite(&type, sizeof(uint32_t), 1, fout);
|
| 16704 |
fwrite(&op, sizeof(uint32_t), 1, fout);
|
| 16705 |
-
fwrite(&n_dims, sizeof(uint32_t), 1, fout);
|
| 16706 |
|
| 16707 |
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
|
| 16708 |
const uint64_t ne = tensor->ne[j];
|
|
@@ -16874,12 +17050,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
|
|
| 16874 |
{
|
| 16875 |
uint32_t type;
|
| 16876 |
uint32_t op;
|
| 16877 |
-
uint32_t n_dims;
|
| 16878 |
|
| 16879 |
for (uint32_t i = 0; i < n_leafs; ++i) {
|
| 16880 |
type = *(const uint32_t *) ptr; ptr += sizeof(type);
|
| 16881 |
op = *(const uint32_t *) ptr; ptr += sizeof(op);
|
| 16882 |
-
n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
|
| 16883 |
|
| 16884 |
int64_t ne[GGML_MAX_DIMS];
|
| 16885 |
size_t nb[GGML_MAX_DIMS];
|
|
@@ -16895,7 +17069,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
|
|
| 16895 |
nb[j] = nb_cur;
|
| 16896 |
}
|
| 16897 |
|
| 16898 |
-
struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type,
|
| 16899 |
|
| 16900 |
tensor->op = (enum ggml_op) op;
|
| 16901 |
|
|
@@ -16912,7 +17086,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
|
|
| 16912 |
|
| 16913 |
ptr += ggml_nbytes(tensor);
|
| 16914 |
|
| 16915 |
-
fprintf(stderr, "%s: loaded leaf %d: '%16s', %
|
| 16916 |
}
|
| 16917 |
}
|
| 16918 |
|
|
@@ -16922,12 +17096,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
|
|
| 16922 |
{
|
| 16923 |
uint32_t type;
|
| 16924 |
uint32_t op;
|
| 16925 |
-
uint32_t n_dims;
|
| 16926 |
|
| 16927 |
for (uint32_t i = 0; i < n_nodes; ++i) {
|
| 16928 |
type = *(const uint32_t *) ptr; ptr += sizeof(type);
|
| 16929 |
op = *(const uint32_t *) ptr; ptr += sizeof(op);
|
| 16930 |
-
n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
|
| 16931 |
|
| 16932 |
enum ggml_op eop = (enum ggml_op) op;
|
| 16933 |
|
|
@@ -16998,7 +17170,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
|
|
| 16998 |
} break;
|
| 16999 |
default:
|
| 17000 |
{
|
| 17001 |
-
tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type,
|
| 17002 |
|
| 17003 |
tensor->op = eop;
|
| 17004 |
} break;
|
|
@@ -17017,7 +17189,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
|
|
| 17017 |
|
| 17018 |
result->nodes[i] = tensor;
|
| 17019 |
|
| 17020 |
-
fprintf(stderr, "%s: loaded node %d: '%16s', %
|
| 17021 |
}
|
| 17022 |
}
|
| 17023 |
}
|
|
@@ -17155,7 +17327,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|
| 17155 |
fprintf(fp, "(%s)|", ggml_type_name(node->type));
|
| 17156 |
}
|
| 17157 |
|
| 17158 |
-
if (node
|
| 17159 |
fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
|
| 17160 |
} else {
|
| 17161 |
fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
|
|
@@ -17422,7 +17594,7 @@ static enum ggml_opt_result ggml_opt_adam(
|
|
| 17422 |
int64_t i = 0;
|
| 17423 |
for (int p = 0; p < np; ++p) {
|
| 17424 |
const int64_t ne = ggml_nelements(ps[p]);
|
| 17425 |
-
const float p_decay = ((ps[p]
|
| 17426 |
for (int64_t j = 0; j < ne; ++j) {
|
| 17427 |
float x = ggml_get_f32_1d(ps[p], j);
|
| 17428 |
float g_ = g[i]*gnorm;
|
|
@@ -18696,7 +18868,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
| 18696 |
return NULL;
|
| 18697 |
}
|
| 18698 |
|
| 18699 |
-
const size_t size_cur = (
|
| 18700 |
|
| 18701 |
ctx->size += GGML_PAD(size_cur, ctx->alignment);
|
| 18702 |
}
|
|
@@ -19025,6 +19197,10 @@ char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
|
|
| 19025 |
return ctx->infos[i].name.data;
|
| 19026 |
}
|
| 19027 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19028 |
// returns the index
|
| 19029 |
static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
|
| 19030 |
const int idx = gguf_find_key(ctx, key);
|
|
@@ -19200,8 +19376,8 @@ void gguf_add_tensor(
|
|
| 19200 |
ctx->infos[idx].ne[i] = 1;
|
| 19201 |
}
|
| 19202 |
|
| 19203 |
-
ctx->infos[idx].n_dims = tensor
|
| 19204 |
-
for (
|
| 19205 |
ctx->infos[idx].ne[i] = tensor->ne[i];
|
| 19206 |
}
|
| 19207 |
|
|
|
|
| 1997 |
return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
|
| 1998 |
}
|
| 1999 |
|
| 2000 |
int ggml_blck_size(enum ggml_type type) {
|
| 2001 |
return type_traits[type].blck_size;
|
| 2002 |
}
|
|
|
|
| 2005 |
return type_traits[type].type_size;
|
| 2006 |
}
|
| 2007 |
|
| 2008 |
+
size_t ggml_row_size(enum ggml_type type, int64_t ne) {
|
| 2009 |
+
assert(ne % ggml_blck_size(type) == 0);
|
| 2010 |
+
return ggml_type_size(type)*ne/ggml_blck_size(type);
|
| 2011 |
+
}
|
| 2012 |
+
|
| 2013 |
+
double ggml_type_sizef(enum ggml_type type) {
|
| 2014 |
+
return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
|
| 2015 |
}
|
| 2016 |
|
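ggml_row_size above replaces float-valued ggml_type_sizef for size computations: for block-quantized types the row size is an exact integer. A sketch with made-up block parameters (32 elements per block, 18 bytes per block, roughly Q4_0-shaped) rather than values read from type_traits:

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

static const int    blck_size = 32;   // elements per quantization block (assumption)
static const size_t type_size = 18;   // bytes per block (assumption)

static size_t row_size(int64_t ne) {  // mirrors the shape of ggml_row_size()
    assert(ne % blck_size == 0);
    return type_size * (size_t) ne / blck_size;
}

int main(void) {
    const int64_t ne = 4096;
    printf("exact row size   : %zu bytes\n", row_size(ne));                    // 2304
    printf("old float variant: %f bytes\n", ne * ((double) type_size / blck_size));
    return 0;
}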
| 2017 |
const char * ggml_type_name(enum ggml_type type) {
|
|
|
|
| 2048 |
return ggml_type_size(tensor->type);
|
| 2049 |
}
|
| 2050 |
|
| 2051 |
+
bool ggml_is_scalar(const struct ggml_tensor * tensor) {
|
| 2052 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 2053 |
|
| 2054 |
return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
| 2055 |
}
|
| 2056 |
|
| 2057 |
+
bool ggml_is_vector(const struct ggml_tensor * tensor) {
|
| 2058 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 2059 |
|
| 2060 |
return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
| 2061 |
}
|
| 2062 |
|
| 2063 |
+
bool ggml_is_matrix(const struct ggml_tensor * tensor) {
|
| 2064 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 2065 |
|
| 2066 |
return tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
| 2067 |
}
|
| 2068 |
|
| 2069 |
+
bool ggml_is_3d(const struct ggml_tensor * tensor) {
|
| 2070 |
+
return tensor->ne[3] == 1;
|
| 2071 |
+
}
|
| 2072 |
+
|
| 2073 |
+
int ggml_n_dims(const struct ggml_tensor * tensor) {
|
| 2074 |
+
for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
|
| 2075 |
+
if (tensor->ne[i] > 1) {
|
| 2076 |
+
return i + 1;
|
| 2077 |
+
}
|
| 2078 |
+
}
|
| 2079 |
+
return 1;
|
| 2080 |
+
}
|
| 2081 |
+
|
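With tensor->n_dims removed, dimensionality is now derived from ne[] on demand, which is what ggml_n_dims above does. A self-contained sketch using a stand-in struct rather than the real struct ggml_tensor:

#include <stdio.h>
#include <stdint.h>

#define MAX_DIMS 4

struct shape { int64_t ne[MAX_DIMS]; };

static int n_dims(const struct shape * t) {
    for (int i = MAX_DIMS - 1; i >= 1; --i) {
        if (t->ne[i] > 1) {
            return i + 1;
        }
    }
    return 1;   // scalars and 1-D tensors both report 1
}

int main(void) {
    struct shape v = { { 128, 1, 1, 1 } };
    struct shape m = { { 128, 64, 1, 1 } };
    struct shape b = { { 128, 64, 8, 1 } };
    printf("%d %d %d\n", n_dims(&v), n_dims(&m), n_dims(&b)); // 1 2 3
    return 0;
}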
| 2082 |
static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
| 2083 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 2084 |
|
|
|
|
| 2383 |
size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
|
| 2384 |
size_t max_size = 0;
|
| 2385 |
|
| 2386 |
+
for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
|
| 2387 |
+
max_size = MAX(max_size, ggml_nbytes(tensor));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2388 |
}
|
| 2389 |
|
| 2390 |
return max_size;
|
|
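The new ggml_get_max_tensor_size body above doubles as a template for user code: the now-public ggml_get_first_tensor / ggml_get_next_tensor pair turns "visit every tensor in a context" into one loop. A usage sketch, assuming ggml.h from this revision is on the include path:

#include <stdio.h>
#include "ggml.h"

static size_t max_tensor_size(const struct ggml_context * ctx) {
    size_t max_size = 0;
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL;
         t = ggml_get_next_tensor(ctx, t)) {
        const size_t size = ggml_nbytes(t);
        if (size > max_size) {
            max_size = size;
        }
    }
    return max_size;
}

int main(void) {
    struct ggml_init_params params = { 16*1024*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(params);
    ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    printf("max tensor size: %zu bytes\n", max_tensor_size(ctx)); // 16384
    ggml_free(ctx);
    return 0;
}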
|
|
| 2473 |
view_src = view_src->view_src;
|
| 2474 |
}
|
| 2475 |
|
| 2476 |
+
size_t data_size = ggml_row_size(type, ne[0]);
|
| 2477 |
for (int i = 1; i < n_dims; i++) {
|
| 2478 |
data_size *= ne[i];
|
| 2479 |
}
|
|
|
|
| 2516 |
/*.type =*/ type,
|
| 2517 |
/*.backend =*/ GGML_BACKEND_CPU,
|
| 2518 |
/*.buffer =*/ NULL,
|
|
|
|
| 2519 |
/*.ne =*/ { 1, 1, 1, 1 },
|
| 2520 |
/*.nb =*/ { 0, 0, 0, 0 },
|
| 2521 |
/*.op =*/ GGML_OP_NONE,
|
|
|
|
| 2622 |
}
|
| 2623 |
|
| 2624 |
struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
|
| 2625 |
+
return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
|
| 2626 |
}
|
| 2627 |
|
| 2628 |
static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
|
|
|
|
| 3071 |
struct ggml_tensor * ggml_view_tensor(
|
| 3072 |
struct ggml_context * ctx,
|
| 3073 |
struct ggml_tensor * src) {
|
| 3074 |
+
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0);
|
| 3075 |
ggml_format_name(result, "%s (view)", src->name);
|
| 3076 |
|
| 3077 |
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
|
|
|
| 3081 |
return result;
|
| 3082 |
}
|
| 3083 |
|
| 3084 |
+
struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
|
| 3085 |
struct ggml_object * obj = ctx->objects_begin;
|
| 3086 |
|
| 3087 |
char * const mem_buffer = ctx->mem_buffer;
|
|
|
|
| 3097 |
return NULL;
|
| 3098 |
}
|
| 3099 |
|
| 3100 |
+
struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
|
| 3101 |
struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
|
| 3102 |
obj = obj->next;
|
| 3103 |
|
|
|
|
@@ 3229 @@
         is_node = true;
     }

+    struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);

     result->op   = GGML_OP_ADD;
+    result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL;
     result->src[0] = a;
     result->src[1] = b;

@@ 3601 @@
         is_node = true;
     }

+    int64_t ne[GGML_MAX_DIMS] = { 1 };
+    for (int i = 1; i < GGML_MAX_DIMS; ++i) {
         ne[i] = a->ne[i];
     }

+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);

     result->op   = GGML_OP_SUM_ROWS;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ 3627 @@
         is_node = true;
     }

+    int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

     result->op   = GGML_OP_MEAN;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ 3650 @@
         is_node = true;
     }

+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
 ...

     result->op   = GGML_OP_ARGMAX;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ 3673 @@
         is_node = true;
     }

+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);

     result->op   = GGML_OP_REPEAT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ 3700 @@
         return a;
     }

+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);

     result->op   = GGML_OP_REPEAT_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ 4076 @@
     }

     const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

     result->op   = GGML_OP_MUL_MAT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 ...
     return result;
 }

+void ggml_mul_mat_set_prec(
+        struct ggml_tensor * a,
+        enum   ggml_prec     prec) {
+    const int32_t prec_i32 = (int32_t) prec;
+
+    ggml_set_op_params_i32(a, 0, prec_i32);
+}
+
 // ggml_mul_mat_id

 struct ggml_tensor * ggml_mul_mat_id(
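A minimal sketch of how ggml_mul_mat_set_prec() might be used from graph-building code; the weight and input tensor names are illustrative, not taken from the diff:

// sketch: request F32 accumulation for one matmul node (the op_params slot written above)
struct ggml_tensor * cur = ggml_mul_mat(ctx, model_w, inp); // hypothetical weights/input tensors
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);                  // GGML_PREC_DEFAULT leaves the old behavior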
@@ 4118 @@
     }

     const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

     ggml_set_op_params_i32(result, 0, id);
     ggml_set_op_params_i32(result, 1, n_as);

@@ 4156 @@

     // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
     const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

     result->op   = GGML_OP_OUT_PROD;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ 4171 @@
 static struct ggml_tensor * ggml_scale_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
+        float                 s,
         bool                  inplace) {
 ...
     GGML_ASSERT(ggml_is_padded_1d(a));

     bool is_node = false;

+    if (a->grad) {
         is_node = true;
     }

     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

+    ggml_set_op_params(result, &s, sizeof(s));
+
     result->op   = GGML_OP_SCALE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
 ...

     return result;
 }
 ...
 struct ggml_tensor * ggml_scale(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
+        float                 s) {
+    return ggml_scale_impl(ctx, a, s, false);
 }

 struct ggml_tensor * ggml_scale_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
+        float                 s) {
+    return ggml_scale_impl(ctx, a, s, true);
 }

 // ggml_set
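With this change ggml_scale() takes the factor as a plain float instead of a 1-element tensor, so the value is stored in op_params and callers no longer allocate or set a scale tensor. A minimal before/after sketch (variable names are illustrative, not from the diff):

// sketch: scaling attention scores with the new API
// old pattern (removed): create a 1-element tensor, allocate it, set its value, pass it to ggml_scale
// new pattern: pass the constant directly
const float kq_scale = 1.0f/sqrtf((float) n_embd_head);        // hypothetical head size; sqrtf from <math.h>
struct ggml_tensor * KQ_scaled = ggml_scale(ctx, KQ, kq_scale); // KQ is a hypothetical attention-score tensor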
@@ 4441 @@
         //GGML_ASSERT(false);
     }

+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);

     result->op   = GGML_OP_RESHAPE;

@@ 4819 @@
     }

     const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);

     result->op   = GGML_OP_DIAG;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ 5466 @@
         is_node = true;
     }

+    const int64_t ne[2] = {
         ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
         a->ne[1],
     };

@@ 5585 @@
         enum ggml_sort_order  order) {
     bool is_node = false;

+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);

     ggml_set_op_params_i32(result, 0, (int32_t) order);

@@ 5632 @@
     }

     //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);

     int32_t t = masked ? 1 : 0;
     ggml_set_op_params(result, &t, sizeof(t));

@@ 5665 @@
     }

     //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);

     result->op   = GGML_OP_FLASH_FF;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ 5781 @@
     const int np = npx*npy;

     const int64_t ne[4] = { a->ne[0], w, w, np, };
 ...
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

     int32_t params[] = { npx, npy, w };
@@ 7764 @@
     const int ith = params->ith;
     const int nth = params->nth;

 ...
 #ifdef GGML_USE_CLBLAST
     if (src1->backend == GGML_BACKEND_GPU) {
+        // TODO: OpenCL kernel support full broadcast
+        GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
         if (ith == 0) {
             ggml_cl_mul(src0, src1, dst);
         }

@@ 9164 @@
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));

+    GGML_ASSERT(eps > 0.0f);
+
     // TODO: optimize
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {

@@ 9235 @@
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));

+    GGML_ASSERT(eps > 0.0f);
+
     // TODO: optimize
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {

@@ 9580 @@
     }
 #endif

 ...
 static void ggml_compute_forward_mul_mat(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
+              struct ggml_tensor * dst) {
 ...
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);

@@ 9652 @@
                 const int64_t i03 = i13/r3;
                 const int64_t i02 = i12/r2;

+                const void  * x = (char *)            src0->data + i02*nb02 + i03*nb03;
+                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
+                      float * d = (float *) ((char *)  dst->data + i12*nb2  + i13*nb3);

                 if (type != GGML_TYPE_F32) {
                     float * const wdata    = params->wdata;

@@ 9671 @@
                 }

                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
+                        ne1, ne01, ne10,
                         1.0f,    y, ne10,
                                  x, ne00,
                         0.0f,    d, ne01);

@@ 9687 @@
     if (params->type == GGML_TASK_INIT) {
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
+            const size_t row_size = ggml_row_size(vec_dot_type, ne10);

             assert(params->wsize >= ne11*ne12*ne13*row_size);
             assert(src1->type == GGML_TYPE_F32);

@@ 9710 @@
     }

     const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const size_t row_size = ggml_row_size(vec_dot_type, ne10);

+    const int64_t nr0 = ne01;          // src0 rows
+    const int64_t nr1 = ne1*ne12*ne13; // src1 rows

     //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);

@@ 9755 @@
     for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
         for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
             for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+                const int64_t i13 = (ir1/(ne12*ne1));
+                const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
+                const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);

                 // broadcast src0 into src1
                 const int64_t i03 = i13/r3;
@@ 9797 @@

 static void ggml_compute_forward_mul_mat_id(
         const struct ggml_compute_params * params,
+        const struct ggml_tensor * ids,
         const struct ggml_tensor * src1,
               struct ggml_tensor * dst) {

+    const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const enum ggml_type type = src0->type;
+
+    const bool src1_cont = ggml_is_contiguous(src1);
+
+    ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
+    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
+    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
+
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));

+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    // row groups
     const int id   = ggml_get_op_params_i32(dst, 0);
     const int n_as = ggml_get_op_params_i32(dst, 1);

+    char * wdata_src1_end = (src1->type == vec_dot_type) ?
+            (char *) params->wdata :
+            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+
+    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+    int64_t * matrix_rows       = matrix_row_counts + n_as;     // [n_as][ne11]
+
+    #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
+
+    if (params->type == GGML_TASK_INIT) {
+        char * wdata = params->wdata;
+        if (src1->type != vec_dot_type) {
+            const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+            assert(params->wsize >= ne11*ne12*ne13*row_size);
+            assert(src1->type == GGML_TYPE_F32);
+
+            for (int64_t i13 = 0; i13 < ne13; ++i13) {
+                for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                    for (int64_t i11 = 0; i11 < ne11; ++i11) {
+                        from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
+                        wdata += row_size;
+                    }
+                }
+            }
+        }
+
+        // initialize matrix_row_counts
+        GGML_ASSERT(wdata == wdata_src1_end);
+        memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
+
+        // group rows by src0 matrix
+        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+            const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);

+            GGML_ASSERT(row_id >= 0 && row_id < n_as);
+            MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
+            matrix_row_counts[row_id] += 1;
+        }

+        return;
 ...
     }
+
+    if (params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    // compute each matrix multiplication in sequence
+    for (int cur_a = 0; cur_a < n_as; ++cur_a) {
+        const int64_t cne1 = matrix_row_counts[cur_a];
+
+        if (cne1 == 0) {
+            continue;
+        }
+
+        const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
+
+        const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+        const int64_t nr0 = ne01;           // src0 rows
+        const int64_t nr1 = cne1*ne12*ne13; // src1 rows
+
+        //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+
+        // distribute the thread work across the inner or outer loop based on which one is larger
+
+        const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+        const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
+
+        const int64_t ith0 = ith % nth0;
+        const int64_t ith1 = ith / nth0;
+
+        const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
+        const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
+
+        const int64_t ir010 = dr0*ith0;
+        const int64_t ir011 = MIN(ir010 + dr0, nr0);
+
+        const int64_t ir110 = dr1*ith1;
+        const int64_t ir111 = MIN(ir110 + dr1, nr1);
+
+        //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
+
+        // threads with no work simply yield (not sure if it helps)
+        if (ir010 >= ir011 || ir110 >= ir111) {
+            sched_yield();
+            continue;
+        }
+
+        assert(ne12 % ne02 == 0);
+        assert(ne13 % ne03 == 0);
+
+        // block-tiling attempt
+        const int64_t blck_0 = 16;
+        const int64_t blck_1 = 16;
+
+        // attempt to reduce false-sharing (does not seem to make a difference)
+        float tmp[16];
+
+        for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
+            for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
+                for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+                    const int64_t  i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
+                    const int64_t  i12 = (ir1 - i13*ne12*cne1)/cne1;
+                    const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
+                    const int64_t  i11 = MMID_MATRIX_ROW(cur_a, _i11);
+
+                    // broadcast src0 into src1
+                    const int64_t i03 = i13/r3;
+                    const int64_t i02 = i12/r2;
+
+                    const int64_t i1 = i11;
+                    const int64_t i2 = i12;
+                    const int64_t i3 = i13;
+
+                    const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
+
+                    // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                    //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                    //       the original src1 data pointer, so we should index using the indices directly
+                    // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                    const char * src1_col = (const char *) wdata +
+                        (src1_cont || src1->type != vec_dot_type
+                         ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
+                         : (i11*nb11 + i12*nb12 + i13*nb13));
+
+                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+
+                    //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                    //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                    //}
+
+                    for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                        vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+                    }
+                    memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
+                }
+            }
+        }
+    }
+
+    #undef MMID_MATRIX_ROW
 }

 // ggml_compute_forward_out_prod
@@ 10325 @@
 static void ggml_compute_forward_scale_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
 ...
         struct ggml_tensor * dst) {
     GGML_ASSERT(ggml_is_contiguous(src0));
     GGML_ASSERT(ggml_is_contiguous(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 ...

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }

     // scale factor
+    float v;
+    memcpy(&v, dst->op_params, sizeof(float));

     const int ith = params->ith;
     const int nth = params->nth;

@@ 10367 @@
 static void ggml_compute_forward_scale(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
 ...
         struct ggml_tensor * dst) {
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
+                ggml_compute_forward_scale_f32(params, src0, dst);
             } break;
         default:
             {
@@ 11560 @@
             }
         } else {
             // TODO: this might be wrong for ne0 != n_dims - need double check
+            //       it seems we have to rope just the first n_dims elements and do nothing with the rest
+            //       ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
             theta_base *= freq_scale;
+            for (int64_t ic = 0; ic < ne0; ic += 2) {
+                if (ic < n_dims) {
+                    const int64_t ib = 0;
+
                     // simplified from `(ib * n_dims + ic) * inv_ndims`
                     float cur_rot = inv_ndims * ic - ib;

 ...

                     dst_data[0]        = x0*cos_theta - x1*sin_theta;
                     dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+                } else {
+                    const int64_t i0 = ic;
+
+                    const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                          float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                    dst_data[0] = src[0];
+                    dst_data[1] = src[1];
                 }
             }
         }

@@ 11724 @@
             }
         } else {
             // TODO: this might be wrong for ne0 != n_dims - need double check
+            //       it seems we have to rope just the first n_dims elements and do nothing with the rest
+            //       ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
             theta_base *= freq_scale;
+            for (int64_t ic = 0; ic < ne0; ic += 2) {
+                if (ic < n_dims) {
+                    const int64_t ib = 0;
+
                     // simplified from `(ib * n_dims + ic) * inv_ndims`
                     float cur_rot = inv_ndims * ic - ib;

 ...

                     dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
                     dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                } else {
+                    const int64_t i0 = ic;
+
+                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                          ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                    dst_data[0] = src[0];
+                    dst_data[1] = src[1];
                 }
             }
         }
@@ 14369 @@
             } break;
         case GGML_OP_MUL_MAT:
             {
+                ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_MUL_MAT_ID:
             {

@@ 14381 @@
             } break;
         case GGML_OP_SCALE:
             {
+                ggml_compute_forward_scale(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_SET:
             {

@@ 14745 @@
         return replacements->vals[i];
     }

+    struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, GGML_MAX_DIMS, node->ne);

     // insert clone into replacements
     GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite

@@ 14837 @@

 static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
     if (ggml_hash_contains(zero_table, a)) {
+        struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f);
         return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
     } else {
         return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);

@@ 14973 @@
                             src0->grad,
                             ggml_scale(ctx,
                                 ggml_mul(ctx, src0, tensor->grad),
+                                2.0f),
                             zero_table);
                 }
             } break;

@@ 14987 @@
                             ggml_div(ctx,
                                 tensor->grad,
                                 tensor),
+                            0.5f),
                         zero_table);
                 }
             } break;

@@ 15153 @@
             {
                 // necessary for llama
                 if (src0->grad) {
+                    float s;
+                    memcpy(&s, tensor->op_params, sizeof(float));
+
                     src0->grad =
                         ggml_add_or_set(ctx,
                             src0->grad,
+                            ggml_scale_impl(ctx, tensor->grad, s, false),
 ...
                             zero_table);
                 }
             } break;

@@ 15337 @@
                     const int n_past = ((int32_t *) tensor->op_params)[0];
                     src0->grad =
                         ggml_add_or_set(ctx, src0->grad,
+                            /* ggml_diag_mask_inf_impl() shouldn't be here */
+                            /* ref:  https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
                             ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
                             zero_table);
                 }
@@ 16167 @@
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
 ...
                 n_tasks = n_threads;
             } break;
         case GGML_OP_OUT_PROD:

@@ 16495 @@
                 } else
 #endif
                 if (node->src[1]->type != vec_dot_type) {
+                    cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
                 }
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
+                const struct ggml_tensor * src0 = node->src[2];
+                const struct ggml_tensor * src1 = node->src[1];
+                const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
+                if (src1->type != vec_dot_type) {
+                    cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
 ...
                 }
+                const int n_as = ggml_get_op_params_i32(node, 1);
+                cur = GGML_PAD(cur, sizeof(int64_t));        // align
+                cur += n_as * sizeof(int64_t);               // matrix_row_counts
+                cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
             } break;
         case GGML_OP_OUT_PROD:
             {
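The sizing above mirrors the scratch layout used by ggml_compute_forward_mul_mat_id(): an optional converted copy of src1 followed by two int64 bookkeeping arrays. A minimal sketch of that calculation, under the assumption of a 2D src1 with ne10 columns and ne11 rows and n_as expert matrices (the helper name and parameters are illustrative):

// sketch: work-buffer size for a MUL_MAT_ID node, matching the layout set up in the compute function
static size_t mul_mat_id_wsize(enum ggml_type src1_type, enum ggml_type vec_dot_type,
                               int64_t ne10, int64_t ne11, int n_as) {
    size_t cur = 0;
    if (src1_type != vec_dot_type) {
        cur = ggml_row_size(vec_dot_type, ne10)*ne11; // src1 rows converted to vec_dot_type
    }
    cur  = GGML_PAD(cur, sizeof(int64_t));            // align the bookkeeping arrays
    cur += n_as*sizeof(int64_t);                      // matrix_row_counts[n_as]
    cur += n_as*ne11*sizeof(int64_t);                 // matrix_rows[n_as][ne11]
    return cur;
}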
@@ 16739 @@
         fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
                 ggml_type_name(tensor->type),
                 ggml_op_name (tensor->op),
+                ggml_n_dims(tensor),
                 ne[0], ne[1], ne[2], ne[3],
                 nb[0], nb[1], nb[2], nb[3],
                 tensor->data,

@@ 16754 @@
                 arg,
                 ggml_type_name(tensor->type),
                 ggml_op_name (tensor->op),
+                ggml_n_dims(tensor),
                 ne[0], ne[1], ne[2], ne[3],
                 nb[0], nb[1], nb[2], nb[3],
                 tensor->data,

@@ 16844 @@

             const uint32_t type = tensor->type;
             const uint32_t op   = tensor->op;
 ...

             fwrite(&type, sizeof(uint32_t), 1, fout);
             fwrite(&op,   sizeof(uint32_t), 1, fout);
 ...

             for (int j = 0; j < GGML_MAX_DIMS; ++j) {
                 const uint64_t ne = tensor->ne[j];

@@ 16876 @@

             const uint32_t type = tensor->type;
             const uint32_t op   = tensor->op;
 ...

             fwrite(&type, sizeof(uint32_t), 1, fout);
             fwrite(&op,   sizeof(uint32_t), 1, fout);
 ...

             for (int j = 0; j < GGML_MAX_DIMS; ++j) {
                 const uint64_t ne = tensor->ne[j];

@@ 17050 @@
     {
         uint32_t type;
         uint32_t op;
 ...

         for (uint32_t i = 0; i < n_leafs; ++i) {
             type = *(const uint32_t *) ptr; ptr += sizeof(type);
             op   = *(const uint32_t *) ptr; ptr += sizeof(op);
 ...

             int64_t ne[GGML_MAX_DIMS];
             size_t  nb[GGML_MAX_DIMS];

@@ 17069 @@
                 nb[j] = nb_cur;
             }

+            struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);

             tensor->op = (enum ggml_op) op;

@@ 17086 @@

             ptr += ggml_nbytes(tensor);

+            fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
         }
     }

@@ 17096 @@
     {
         uint32_t type;
         uint32_t op;
 ...

         for (uint32_t i = 0; i < n_nodes; ++i) {
             type = *(const uint32_t *) ptr; ptr += sizeof(type);
             op   = *(const uint32_t *) ptr; ptr += sizeof(op);
 ...

             enum ggml_op eop = (enum ggml_op) op;

@@ 17170 @@
                     } break;
                 default:
                     {
+                        tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);

                         tensor->op = eop;
                     } break;

@@ 17189 @@

             result->nodes[i] = tensor;

+            fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
         }
     }
 }

@@ 17327 @@
             fprintf(fp, "(%s)|", ggml_type_name(node->type));
         }

+        if (ggml_is_matrix(node)) {
             fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
         } else {
             fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));

@@ 17594 @@
         int64_t i = 0;
         for (int p = 0; p < np; ++p) {
             const int64_t ne = ggml_nelements(ps[p]);
+            const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? decay : 0.0f) * sched;
             for (int64_t j = 0; j < ne; ++j) {
                 float x  = ggml_get_f32_1d(ps[p], j);
                 float g_ = g[i]*gnorm;

@@ 18868 @@
             return NULL;
         }

+        const size_t size_cur = ggml_row_size(info->type, ne);

         ctx->size += GGML_PAD(size_cur, ctx->alignment);
     }

@@ 19197 @@
     return ctx->infos[i].name.data;
 }

+enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
+    return ctx->infos[i].type;
+}
+
 // returns the index
 static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
     const int idx = gguf_find_key(ctx, key);

@@ 19376 @@
         ctx->infos[idx].ne[i] = 1;
     }

+    ctx->infos[idx].n_dims = ggml_n_dims(tensor);
+    for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
         ctx->infos[idx].ne[i] = tensor->ne[i];
     }

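A minimal sketch of how the new gguf_get_tensor_type() accessor can be combined with the existing enumeration helpers. The file name is illustrative, error handling is reduced to the bare minimum, and the gguf_init_params initializer assumes the usual no_alloc/ctx field pair:

// sketch: list the name and type of every tensor recorded in a GGUF file
#include <stdio.h>
#include "ggml.h"

int list_gguf_tensors(const char * fname) { // e.g. "model.gguf" (hypothetical)
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
    struct gguf_context * ctx = gguf_init_from_file(fname, params);
    if (ctx == NULL) {
        return 1;
    }

    const int n = gguf_get_n_tensors(ctx);
    for (int i = 0; i < n; ++i) {
        printf("%-40s %s\n", gguf_get_tensor_name(ctx, i), ggml_type_name(gguf_get_tensor_type(ctx, i)));
    }

    gguf_free(ctx);
    return 0;
}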
ggml.h
CHANGED

@@ -303,7 +303,7 @@ extern "C" {

 #if defined(__ARM_NEON) && defined(__CUDACC__)
     typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON)
+#elif defined(__ARM_NEON) && !defined(_MSC_VER)
     typedef __fp16 ggml_fp16_t;
 #else
     typedef uint16_t ggml_fp16_t;
@@ -343,6 +343,12 @@ extern "C" {
         GGML_TYPE_COUNT,
     };

+    // precision
+    enum ggml_prec {
+        GGML_PREC_DEFAULT,
+        GGML_PREC_F32,
+    };
+
     enum ggml_backend_type {
         GGML_BACKEND_CPU = 0,
         GGML_BACKEND_GPU = 10,
@@ -478,7 +484,8 @@ extern "C" {
     enum ggml_log_level {
         GGML_LOG_LEVEL_ERROR = 2,
         GGML_LOG_LEVEL_WARN  = 3,
-        GGML_LOG_LEVEL_INFO  = 4
+        GGML_LOG_LEVEL_INFO  = 4,
+        GGML_LOG_LEVEL_DEBUG = 5
     };

     // ggml object
@@ -502,7 +509,6 @@ extern "C" {

         struct ggml_backend_buffer * buffer;

-        int     n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
         size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
                                    // nb[0] = ggml_type_size(type)
@@ -534,7 +540,7 @@ extern "C" {

         void * extra; // extra things e.g. for ggml-cuda.cu

-        char padding[
+        char padding[8];
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -639,11 +645,14 @@ extern "C" {
     GGML_API int64_t ggml_nrows      (const struct ggml_tensor * tensor);
     GGML_API size_t  ggml_nbytes     (const struct ggml_tensor * tensor);
     GGML_API size_t  ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
-    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

-    GGML_API int
-    GGML_API size_t
-    GGML_API
+    GGML_API int    ggml_blck_size(enum ggml_type type);
+    GGML_API size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
+    GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+
+    GGML_DEPRECATED(
+    GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
+    "use ggml_row_size() instead");

     GGML_API const char * ggml_type_name(enum ggml_type type);
     GGML_API const char * ggml_op_name  (enum ggml_op   op);
@@ -662,6 +671,11 @@ extern "C" {
     GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_vector    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_matrix    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
+    GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars

     GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);

@@ -722,8 +736,8 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);

     // Context tensor enumeration and lookup
-    GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
-    GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
+    GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);

     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
@@ -1050,6 +1064,12 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);

+    // change the precision of a matrix multiplication
+    // set to GGML_PREC_F32 for higher precision (useful for phi-2)
+    GGML_API void ggml_mul_mat_set_prec(
+            struct ggml_tensor * a,
+            enum   ggml_prec     prec);
+
     // indirect matrix multiplication
     //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
     GGML_API struct ggml_tensor * ggml_mul_mat_id(
@@ -1075,13 +1095,13 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_scale(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-
+            float                 s);

     // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_scale_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-
+            float                 s);

     // b -> view(a,offset,nb1,nb2,3), return modified a
     GGML_API struct ggml_tensor * ggml_set(
@@ -2116,10 +2136,11 @@ extern "C" {
     GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
     GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);

-    GGML_API int
-    GGML_API int
-    GGML_API size_t
-    GGML_API char *
+    GGML_API int            gguf_get_n_tensors    (const struct gguf_context * ctx);
+    GGML_API int            gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
+    GGML_API size_t         gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
+    GGML_API char *         gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
+    GGML_API enum ggml_type gguf_get_tensor_type  (const struct gguf_context * ctx, int i);

     // overrides existing values or adds a new one
     GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
whisper.cpp
CHANGED

@@ -487,8 +487,8 @@ static size_t whisper_allocr_size(struct whisper_allocr & allocr) {

 // measure the memory usage of a graph and prepare the allocr's internal data buffer
 static void whisper_allocr_graph_init(struct whisper_allocr & allocr, ggml_backend_t backend, std::function<struct ggml_cgraph *()> && get_graph) {
-    auto & alloc
-    auto & meta
+    auto & alloc = allocr.alloc;
+    auto & meta  = allocr.meta;

     alloc = ggml_allocr_new_measure_from_backend(backend);

@@ -1777,7 +1777,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(

     ggml_cgraph * gf = ggml_new_graph_custom(ctx0, WHISPER_MAX_NODES, false);

-    ggml_allocr * alloc = wstate.alloc_encode.alloc;
+    //ggml_allocr * alloc = wstate.alloc_encode.alloc;

     //struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_ctx, n_state);
     //ggml_allocr_alloc(alloc, cur);
@@ -1787,13 +1787,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
     //}
     struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_conv);

-
-    ggml_allocr_alloc(alloc, KQscale);
-
-    if (!ggml_allocr_is_measure(alloc)) {
-        const float val = 1.0f/sqrtf(float(n_state)/n_head);
-        ggml_backend_tensor_set(KQscale, &val, 0, sizeof(float));
-    }
+    const float KQscale = 1.0f/sqrtf(float(n_state)/n_head);

     // ===================================================================
     // NOTE: experimenting with partial evaluation of the encoder (ignore)
@@ -1843,14 +1837,14 @@ static struct ggml_cgraph * whisper_build_graph_encoder(

             Qcur = ggml_add(ctx0, Qcur, layer.attn_q_b);

-            //Qcur = ggml_scale(ctx0, Qcur,
+            //Qcur = ggml_scale(ctx0, Qcur, pow(float(n_state)/n_head, -0.25));

             // note: no bias for Key
             struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
                     layer.attn_k_w,
                     cur);

-            //Kcur = ggml_scale(ctx0, Kcur,
+            //Kcur = ggml_scale(ctx0, Kcur, pow(float(n_state)/n_head, -0.25));

             struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
                     layer.attn_v_w,
@@ -2032,7 +2026,7 @@ static struct ggml_cgraph * whisper_build_graph_cross(

     ggml_cgraph * gf = ggml_new_graph(ctx0);

-    ggml_allocr * alloc = wstate.alloc_cross.alloc;
+    //ggml_allocr * alloc = wstate.alloc_cross.alloc;

     //struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
     //ggml_allocr_alloc(alloc, cur);
@@ -2042,13 +2036,7 @@ static struct ggml_cgraph * whisper_build_graph_cross(
     //}
     struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_enc);

-
-    ggml_allocr_alloc(alloc, Kscale);
-
-    if (!ggml_allocr_is_measure(alloc)) {
-        const float val = pow(float(n_state) / n_head, -0.25);
-        ggml_backend_tensor_set(Kscale, &val, 0, sizeof(float));
-    }
+    const float  Kscale = pow(float(n_state) / n_head, -0.25);

     for (int il = 0; il < model.hparams.n_text_layer; ++il) {
         auto & layer = model.layers_decoder[il];
@@ -2207,13 +2195,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
         }
     }

-
-    ggml_allocr_alloc(alloc, KQscale);
-
-    if (!ggml_allocr_is_measure(alloc)) {
-        const float val = pow(float(n_state)/n_head, -0.25);
-        ggml_backend_tensor_set(KQscale, &val, 0, sizeof(float));
-    }
+    const float KQscale = pow(float(n_state)/n_head, -0.25);

     struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
     ggml_allocr_alloc(alloc, KQ_mask);
@@ -6128,7 +6110,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {

     // multi-thread

-    for (
+    for (int32_t k = 1; k <= n_threads; k++) {
         char * src = (char *) malloc(size);
         char * dst = (char *) malloc(size);

@@ -6152,13 +6134,13 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
         const int64_t t0 = ggml_time_us();

         std::vector<std::thread> threads(k - 1);
-        for (
+        for (int32_t th = 0; th < k - 1; ++th) {
            threads[th] = std::thread(helper, th);
        }

        helper(k - 1);

-        for (
+        for (int32_t th = 0; th < k - 1; ++th) {
            threads[th].join();
        }

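The whisper.cpp changes above follow directly from the new ggml_scale() signature: the K/Q scaling constants no longer need a dedicated 1-element tensor, an allocator call, and a ggml_backend_tensor_set(), they are simply captured as floats at graph-build time and passed at the call site. A minimal sketch of the resulting pattern (tensor names are illustrative, not a verbatim excerpt):

// sketch: attention-score scaling after the API change
const float KQscale = powf((float) n_state/(float) n_head, -0.25f); // computed once per graph build

// ... later, inside the layer loop ...
struct ggml_tensor * Q = ggml_scale(ctx0, Qcur, KQscale); // no scale tensor to allocate or set
struct ggml_tensor * K = ggml_scale(ctx0, Kcur, KQscale);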