ggerganov committed (unverified)
Commit: aa86ade
Parent(s): 7163150

sync : ggml (ggml_scale, ggml_row_size, etc.) (#1677)


* sync : ggml

* sync : llama.cpp

* talk-llama : fix obsolete param

* ggml-alloc : fix ggml_tallocr_is_own

* talk.wasm : update to new ggml

* ggml : fix type punning in ggml_scale

* ggml : cuda jetson + arm quants warnings

examples/talk-llama/llama.cpp CHANGED
The diff for this file is too large to render. See raw diff
 
examples/talk-llama/llama.h CHANGED
@@ -39,10 +39,11 @@
 
 #define LLAMA_MAX_RNG_STATE (64*1024)
 
+#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 2
+#define LLAMA_SESSION_VERSION 3
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
@@ -126,7 +127,7 @@ extern "C" {
         bool sorted;
     } llama_token_data_array;
 
-    typedef void (*llama_progress_callback)(float progress, void *ctx);
+    typedef bool (*llama_progress_callback)(float progress, void *ctx);
 
     // Input data for llama_decode
     // A llama_batch object can contain input about one or many sequences
@@ -158,16 +159,38 @@ extern "C" {
         llama_seq_id all_seq_id; // used if seq_id == NULL
     } llama_batch;
 
+    enum llama_model_kv_override_type {
+        LLAMA_KV_OVERRIDE_INT,
+        LLAMA_KV_OVERRIDE_FLOAT,
+        LLAMA_KV_OVERRIDE_BOOL,
+    };
+
+    struct llama_model_kv_override {
+        char key[128];
+        enum llama_model_kv_override_type tag;
+        union {
+            int64_t int_value;
+            double float_value;
+            bool bool_value;
+        };
+    };
+
     struct llama_model_params {
        int32_t n_gpu_layers; // number of layers to store in VRAM
        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
-        // called with a progress value between 0 and 1, pass NULL to disable
+        // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+        // If the provided progress_callback returns true, model loading continues.
+        // If it returns false, model loading is immediately aborted.
        llama_progress_callback progress_callback;
+
        // context pointer passed to the progress callback
        void * progress_callback_user_data;
 
+        // override key-value pairs of the model meta data
+        const struct llama_model_kv_override * kv_overrides;
+
        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool vocab_only; // only load the vocabulary, no weights
        bool use_mmap;   // use mmap if possible
@@ -185,17 +208,20 @@ extern "C" {
        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
        float    rope_freq_base;   // RoPE base frequency, 0 = from model
        float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
-        float    yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
+        float    yarn_ext_factor;  // YaRN extrapolation mix factor, negative = from model
        float    yarn_attn_factor; // YaRN magnitude scaling factor
        float    yarn_beta_fast;   // YaRN low correction dim
        float    yarn_beta_slow;   // YaRN high correction dim
        uint32_t yarn_orig_ctx;    // YaRN original context size
 
+        enum ggml_type type_k; // data type for K cache
+        enum ggml_type type_v; // data type for V cache
+
        // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-        bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one
-        bool embedding;  // embedding mode only
+        bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+        bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+        bool embedding;   // embedding mode only
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
    };
 
    // model quantization parameters
@@ -290,7 +316,9 @@ extern "C" {
 
    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 
-    LLAMA_API int llama_n_ctx      (const struct llama_context * ctx);
+    // TODO: become more consistent with returned int types across the API
+    LLAMA_API uint32_t llama_n_ctx    (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_batch  (const struct llama_context * ctx);
 
    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
 
@@ -301,6 +329,23 @@ extern "C" {
    // Get the model's RoPE frequency scaling factor
    LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
 
+    // Functions to access the model's GGUF metadata scalar values
+    //  - The functions return the length of the string on success, or -1 on failure
+    //  - The output string is always null-terminated and cleared on failure
+    //  - GGUF array values are not supported by these functions
+
+    // Get metadata value as a string by key name
+    LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+
+    // Get the number of metadata key/value pairs
+    LLAMA_API int llama_model_meta_count(const struct llama_model * model);
+
+    // Get metadata key name by index
+    LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
+    // Get metadata value as a string by index
+    LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
    // Get a string describing the model type
    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
 
@@ -344,9 +389,60 @@ extern "C" {
    // KV cache
    //
 
-    // Returns the number of tokens in the KV cache
-    LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
-            "avoid using this, it will be removed in the future, instead - count the tokens in user code");
+    // Information associated with an individual cell in the KV cache view.
+    struct llama_kv_cache_view_cell {
+        // The position for this cell. Takes KV cache shifts into account.
+        // May be negative if the cell is not populated.
+        llama_pos pos;
+    };
+
+    // An updateable view of the KV cache.
+    struct llama_kv_cache_view {
+        // Number of KV cache cells. This will be the same as the context size.
+        int32_t n_cells;
+
+        // Maximum number of sequences that can exist in a cell. It's not an error
+        // if there are more sequences in a cell than this value, however they will
+        // not be visible in the view cells_sequences.
+        int32_t n_max_seq;
+
+        // Number of tokens in the cache. For example, if there are two populated
+        // cells, the first with 1 sequence id in it and the second with 2 sequence
+        // ids then you'll have 3 tokens.
+        int32_t token_count;
+
+        // Number of populated cache cells.
+        int32_t used_cells;
+
+        // Maximum contiguous empty slots in the cache.
+        int32_t max_contiguous;
+
+        // Index to the start of the max_contiguous slot range. Can be negative
+        // when cache is full.
+        int32_t max_contiguous_idx;
+
+        // Information for an individual cell.
+        struct llama_kv_cache_view_cell * cells;
+
+        // The sequences for each cell. There will be n_max_seq items per cell.
+        llama_seq_id * cells_sequences;
+    };
+
+    // Create an empty KV cache view. (use only for debugging purposes)
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+
+    // Free a KV cache view. (use only for debugging purposes)
+    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+
+    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+
+    // Returns the number of tokens in the KV cache (slow, use only for debug)
+    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+
+    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+    LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);
 
    // Clear the KV cache
    LLAMA_API void llama_kv_cache_clear(
@@ -517,6 +613,12 @@ extern "C" {
    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
 
+    // Returns -1 if unknown, 1 for true or 0 for false.
+    LLAMA_API int llama_add_bos_token(const struct llama_model * model);
+
+    // Returns -1 if unknown, 1 for true or 0 for false.
+    LLAMA_API int llama_add_eos_token(const struct llama_model * model);
+
    // codellama infill tokens
    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
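
Usage sketch (illustrative, not part of the diff): the header changes above make the progress callback return a bool that can abort loading and add metadata overrides to llama_model_params. A minimal caller might look like the following; llama_model_default_params() and llama_load_model_from_file() are assumed from the llama.cpp API of this version, "some.metadata.key" is a hypothetical key name, and the empty-key terminator for kv_overrides is an assumption about the loader's convention.

#include <stdio.h>
#include "llama.h"

// return false to abort model loading; the old callback returned void
static bool load_progress(float progress, void * ctx) {
    (void) ctx;
    fprintf(stderr, "\rloading: %3d %%", (int)(100.0f * progress));
    return true;
}

struct llama_model * load_with_overrides(const char * path) {
    struct llama_model_kv_override overrides[2];
    snprintf(overrides[0].key, sizeof(overrides[0].key), "some.metadata.key"); // hypothetical key
    overrides[0].tag       = LLAMA_KV_OVERRIDE_INT;
    overrides[0].int_value = 4096;
    overrides[1].key[0]    = '\0'; // assumed list terminator: empty key

    struct llama_model_params mparams = llama_model_default_params();
    mparams.progress_callback           = load_progress;
    mparams.progress_callback_user_data = NULL;
    mparams.kv_overrides                = overrides;

    return llama_load_model_from_file(path, mparams);
}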
examples/talk-llama/talk-llama.cpp CHANGED
@@ -282,7 +282,6 @@ int main(int argc, char ** argv) {
     // tune these to your liking
     lcparams.n_ctx      = 2048;
     lcparams.seed       = 1;
-    lcparams.f16_kv     = true;
     lcparams.n_threads  = params.n_threads;
 
     struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lcparams);
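
Note (illustrative, not part of the diff): the removed f16_kv flag corresponds to the new type_k/type_v fields of llama_context_params shown in the llama.h hunks above; talk-llama simply drops the line because the assumed defaults already select an fp16 KV cache. The explicit equivalent on the lcparams variable above would be:

// sketch: explicit equivalent of the removed `lcparams.f16_kv = true;`
lcparams.type_k = GGML_TYPE_F16; // K cache data type (assumed default)
lcparams.type_v = GGML_TYPE_F16; // V cache data type (assumed default)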
examples/talk.wasm/gpt-2.cpp CHANGED
@@ -155,33 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
         const int n_ctx   = hparams.n_ctx;
         const int n_vocab = hparams.n_vocab;
 
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
+        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
+        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
 
-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // wte
-        ctx_size +=   n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // lm_head
+        ctx_size += n_vocab*ggml_row_size(wtype,         n_embd); // wte
+        ctx_size +=   n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
+        ctx_size += n_vocab*ggml_row_size(wtype,         n_embd); // lm_head
 
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
 
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
 
-        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_attn_w
-        ctx_size += n_layer*(       3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         3*n_embd*n_embd)); // c_attn_attn_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));        // c_attn_attn_b
 
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_proj_w
-        ctx_size += n_layer*(       n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd)); // c_attn_proj_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));        // c_attn_proj_b
 
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
-        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b
 
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
-        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));          // c_mlp_proj_b
 
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
+        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
 
         ctx_size += (6 + 12*n_layer)*256; // object overhead
 
@@ -524,8 +524,7 @@ bool gpt2_eval(
         struct ggml_tensor * KQ_scaled =
             ggml_scale(ctx0,
                     KQ,
-                    ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-                    );
+                    1.0f/sqrt(float(n_embd)/n_head));
 
         // KQ_masked = mask_past(KQ_scaled)
         // [n_past + N, N, 12]
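
Sketch (illustrative, not part of the diff): the two recurring API changes in the hunks above are the ggml_row_size() helper and the new ggml_scale() signature. The fragment below restates them in isolation; wtype, n_vocab, n_embd, n_head, ctx0 and KQ mirror the names used in gpt-2.cpp.

#include <cmath>
#include "ggml.h"

// old: n_vocab*n_embd*ggml_type_sizef(wtype) -- a floating-point size estimate
// new: exact byte count, n_vocab rows of n_embd elements, in integer arithmetic
static size_t wte_size(enum ggml_type wtype, int n_vocab, int n_embd) {
    return (size_t) n_vocab*ggml_row_size(wtype, n_embd);
}

// the scale factor is now passed to ggml_scale() directly as a float,
// instead of a 1-element tensor created with ggml_new_f32()
static struct ggml_tensor * scale_kq(struct ggml_context * ctx0, struct ggml_tensor * KQ, int n_embd, int n_head) {
    return ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
}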
examples/talk/gpt-2.cpp CHANGED
@@ -155,33 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
         const int n_ctx   = hparams.n_ctx;
         const int n_vocab = hparams.n_vocab;
 
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
+        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
+        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
 
-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // wte
-        ctx_size +=   n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // lm_head
+        ctx_size += n_vocab*ggml_row_size(wtype,         n_embd); // wte
+        ctx_size +=   n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
+        ctx_size += n_vocab*ggml_row_size(wtype,         n_embd); // lm_head
 
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
 
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
 
-        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_attn_w
-        ctx_size += n_layer*(       3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         3*n_embd*n_embd)); // c_attn_attn_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));        // c_attn_attn_b
 
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_proj_w
-        ctx_size += n_layer*(       n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd)); // c_attn_proj_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));        // c_attn_proj_b
 
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
-        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b
 
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
-        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));          // c_mlp_proj_b
 
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
+        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
 
         ctx_size += (6 + 12*n_layer)*256; // object overhead
 
@@ -525,8 +525,7 @@ bool gpt2_eval(
         struct ggml_tensor * KQ_scaled =
             ggml_scale(ctx0,
                     KQ,
-                    ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-                    );
+                    1.0f/sqrt(float(n_embd)/n_head));
 
         // KQ_masked = mask_past(KQ_scaled)
         // [n_past + N, N, 12]
extra/sync-llama.sh ADDED
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+cp -rpv ../llama.cpp/llama.h    ./examples/talk-llama/llama.h
+cp -rpv ../llama.cpp/llama.cpp  ./examples/talk-llama/llama.cpp
+cp -rpv ../llama.cpp/unicode.h  ./examples/talk-llama/unicode.h
ggml-alloc.c CHANGED
@@ -72,7 +72,7 @@ static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * t
 
 // check if a tensor is allocated by this buffer
 static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
-    return tensor->buffer == alloc->buffer;
+    return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer);
 }
 
 static bool ggml_is_view(struct ggml_tensor * t) {
@@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
     if (update_backend) {
         view->backend = view->view_src->backend;
     }
-    view->buffer  = view->view_src->buffer;
+    // views are initialized in the alloc buffer rather than the view_src buffer
+    view->buffer  = alloc->buffer;
     view->data    = (char *)view->view_src->data + view->view_offs;
 
-    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
-    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
     assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
 
     if (!alloc->measure) {
@@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
 }
 
 void ggml_allocr_free(ggml_allocr_t alloc) {
+    if (alloc == NULL) {
+        return;
+    }
+
     ggml_gallocr_free(alloc->galloc);
     ggml_tallocr_free(alloc->talloc);
     free(alloc);
@@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
     }
 
     if (nbytes == 0) {
-        fprintf(stderr, "%s: no tensors to allocate\n", __func__);
+        // all the tensors in the context are already allocated
         return NULL;
     }
 
@@ -789,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
             } else {
                 ggml_backend_view_init(buffer, t);
             }
+        } else {
+            if (t->view_src != NULL) {
+                // view of a pre-allocated tensor
+                ggml_backend_view_init(buffer, t);
+            }
         }
     }
 
ggml-backend-impl.h CHANGED
@@ -20,6 +20,9 @@ extern "C" {
         size_t (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
         size_t (*get_alloc_size)  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
         bool   (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // check if tensor data is in host memory
+        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
+        bool (*is_host) (ggml_backend_buffer_type_t buft);
     };
 
     struct ggml_backend_buffer_type {
@@ -31,15 +34,16 @@ extern "C" {
     typedef void * ggml_backend_buffer_context_t;
 
     struct ggml_backend_buffer_i {
-        void   (*free_buffer)(ggml_backend_buffer_t buffer);
+        void   (*free_buffer)    (ggml_backend_buffer_t buffer);
         //void (*reset)          (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
-        void * (*get_base)   (ggml_backend_buffer_t buffer);
-        void   (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void   (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void   (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        void * (*get_base)       (ggml_backend_buffer_t buffer);
+        void   (*init_tensor)    (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void   (*set_tensor)     (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void   (*get_tensor)     (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
         // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
-        void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*clear)          (ggml_backend_buffer_t buffer, uint8_t value);
     };
 
     struct ggml_backend_buffer {
@@ -78,7 +82,7 @@ extern "C" {
         void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
         void (*cpy_tensor_to_async)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
 
-        void (*synchronize) (ggml_backend_t backend);
+        void (*synchronize)(ggml_backend_t backend);
 
         // compute graph with a plan
         ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
ggml-backend.c CHANGED
@@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_ba
     return buft->iface.supports_backend(buft, backend);
 }
 
+bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
+    if (buft->iface.is_host) {
+        return buft->iface.is_host(buft);
+    }
+    return false;
+}
+
 // backend buffer
 
 ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -94,6 +101,14 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
     return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
 }
 
+void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    buffer->iface.clear(buffer, value);
+}
+
+bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
+}
+
 ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
     return buffer->buft;
 }
@@ -378,7 +393,6 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     free(buffer->context);
-    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -411,6 +425,10 @@ static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer,
     GGML_UNUSED(buffer);
 }
 
+static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    memset(buffer->context, value, buffer->size);
+}
+
 static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
     /* .free_buffer     = */ ggml_backend_cpu_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
@@ -419,6 +437,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
     /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
     /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
     /* .cpy_tensor_to   = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+    /* .clear           = */ ggml_backend_cpu_buffer_clear,
 };
 
 // for buffers from ptr, free is not called
@@ -430,6 +449,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
     /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
     /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
     /* .cpy_tensor_to   = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+    /* .clear           = */ ggml_backend_cpu_buffer_clear,
 };
 
 static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
@@ -455,20 +475,70 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty
     GGML_UNUSED(buft);
 }
 
+static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    GGML_UNUSED(buft);
+}
+
 ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
         /* .iface = */ {
             /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,
     };
 
-    return &ggml_backend_buffer_type_cpu;
+    return &ggml_backend_cpu_buffer_type;
 }
 
+#ifdef GGML_USE_CPU_HBM
+
+// buffer type HBM
+
+#include <hbwmalloc.h>
+
+static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    hbw_free(buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    //void * ptr = hbw_malloc(size);
+    void * ptr;
+    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+    if (result != 0) {
+        fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
+        return NULL;
+    }
+
+    // FIXME: this is a hack to avoid having to implement a new buffer type
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
+        /* .iface    = */ {
+            /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .context  = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type_hbm;
+}
+#endif
+
 struct ggml_backend_cpu_context {
     int n_threads;
     void * work_data;
@@ -505,7 +575,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
 
     cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
-    cpu_plan->cgraph = *cgraph;
+    cpu_plan->cgraph = *cgraph; // FIXME: deep copy
 
     if (cpu_plan->cplan.work_size > 0) {
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
@@ -1180,7 +1250,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
 // utils
 void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
-    GGML_ASSERT(tensor->data == NULL);
+    //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
     GGML_ASSERT(tensor->view_src != NULL);
     GGML_ASSERT(tensor->view_src->buffer != NULL);
     GGML_ASSERT(tensor->view_src->data != NULL);
ggml-backend.h CHANGED
@@ -21,6 +21,7 @@ extern "C" {
     GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
     GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
     GGML_API bool   ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+    GGML_API bool   ggml_backend_buft_is_host        (ggml_backend_buffer_type_t buft);
 
     // buffer
     GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
@@ -29,6 +30,8 @@ extern "C" {
     GGML_API void   ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
     GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void   ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API bool   ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
     GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
 
     //
@@ -76,6 +79,10 @@ extern "C" {
 
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
 
+#ifdef GGML_USE_CPU_HBM
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+#endif
+
     //
     // Backend registry
     //
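
Usage sketch (illustrative, not part of the diff): how the buffer helpers declared above might be used together. ggml_backend_buft_alloc_buffer() and ggml_backend_buffer_free() are assumed from the existing ggml-backend API.

#include "ggml-backend.h"

void example_buffer_usage(void) {
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
    ggml_backend_buffer_t      buf  = ggml_backend_buft_alloc_buffer(buft, 16*1024*1024);

    if (ggml_backend_buffer_is_host(buf)) {
        // buffer memory is directly accessible from the CPU
    }

    ggml_backend_buffer_clear(buf, 0); // fill the entire buffer with zeros

    ggml_backend_buffer_free(buf);
}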
ggml-cuda.cu CHANGED
@@ -31,6 +31,7 @@
31
  #define CUDA_R_16F HIPBLAS_R_16F
32
  #define CUDA_R_32F HIPBLAS_R_32F
33
  #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 
34
  #define cublasCreate hipblasCreate
35
  #define cublasGemmEx hipblasGemmEx
36
  #define cublasGemmBatchedEx hipblasGemmBatchedEx
@@ -40,6 +41,7 @@
40
  #define cublasSetStream hipblasSetStream
41
  #define cublasSgemm hipblasSgemm
42
  #define cublasStatus_t hipblasStatus_t
 
43
  #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
44
  #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
45
  #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
@@ -58,8 +60,13 @@
58
  #define cudaGetDeviceProperties hipGetDeviceProperties
59
  #define cudaGetErrorString hipGetErrorString
60
  #define cudaGetLastError hipGetLastError
 
 
 
 
61
  #define cudaMalloc hipMalloc
62
  #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
 
63
  #define cudaMemcpy hipMemcpy
64
  #define cudaMemcpy2DAsync hipMemcpy2DAsync
65
  #define cudaMemcpyAsync hipMemcpyAsync
@@ -78,10 +85,18 @@
78
  #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
79
  #define cudaStream_t hipStream_t
80
  #define cudaSuccess hipSuccess
 
81
  #else
82
  #include <cuda_runtime.h>
83
  #include <cublas_v2.h>
84
  #include <cuda_fp16.h>
 
 
 
 
 
 
 
85
  #endif // defined(GGML_USE_HIPBLAS)
86
 
87
  #include "ggml-cuda.h"
@@ -510,6 +525,14 @@ static size_t g_scratch_offset = 0;
510
 
511
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
512
 
 
 
 
 
 
 
 
 
513
  static __device__ __forceinline__ float warp_reduce_sum(float x) {
514
  #pragma unroll
515
  for (int mask = 16; mask > 0; mask >>= 1) {
@@ -1970,8 +1993,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
1970
  // second part effectively subtracts 8 from each quant value
1971
  return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
1972
  #else
1973
- assert(false);
1974
- return 0.0f; // only to satisfy the compiler
1975
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1976
  }
1977
 
@@ -2008,8 +2030,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
2008
  // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
2009
  return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
2010
  #else
2011
- assert(false);
2012
- return 0.0f; // only to satisfy the compiler
2013
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2014
  }
2015
 
@@ -2044,8 +2065,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
2044
  // second part effectively subtracts 16 from each quant value
2045
  return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
2046
  #else
2047
- assert(false);
2048
- return 0.0f; // only to satisfy the compiler
2049
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2050
  }
2051
 
@@ -2090,8 +2110,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
2090
  return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
2091
 
2092
  #else
2093
- assert(false);
2094
- return 0.0f; // only to satisfy the compiler
2095
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2096
  }
2097
 
@@ -2112,8 +2131,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
2112
 
2113
  return d8_0*d8_1 * sumi;
2114
  #else
2115
- assert(false);
2116
- return 0.0f; // only to satisfy the compiler
2117
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2118
  }
2119
 
@@ -2143,8 +2161,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
2143
  // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
2144
  return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
2145
  #else
2146
- assert(false);
2147
- return 0.0f; // only to satisfy the compiler
2148
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2149
  }
2150
 
@@ -2179,8 +2196,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
2179
 
2180
  return dm2f.x*sumf_d - dm2f.y*sumf_m;
2181
  #else
2182
- assert(false);
2183
- return 0.0f; // only to satisfy the compiler
2184
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2185
  }
2186
 
@@ -2217,8 +2233,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
2217
 
2218
  return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
2219
  #else
2220
- assert(false);
2221
- return 0.0f; // only to satisfy the compiler
2222
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2223
  }
2224
 
@@ -2258,8 +2273,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
2258
 
2259
  return d3 * sumf;
2260
  #else
2261
- assert(false);
2262
- return 0.0f; // only to satisfy the compiler
2263
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2264
  }
2265
 
@@ -2284,8 +2298,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
2284
 
2285
  return d3*d8 * sumi;
2286
  #else
2287
- assert(false);
2288
- return 0.0f; // only to satisfy the compiler
2289
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2290
  }
2291
 
@@ -2318,8 +2331,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
2318
  return dm4f.x*sumf_d - dm4f.y*sumf_m;
2319
 
2320
  #else
2321
- assert(false);
2322
- return 0.0f; // only to satisfy the compiler
2323
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2324
  }
2325
 
@@ -2352,8 +2364,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
2352
  return dm4f.x*sumf_d - dm4f.y*sumf_m;
2353
 
2354
  #else
2355
- assert(false);
2356
- return 0.0f; // only to satisfy the compiler
2357
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2358
  }
2359
 
@@ -2393,8 +2404,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
2393
  return dm5f.x*sumf_d - dm5f.y*sumf_m;
2394
 
2395
  #else
2396
- assert(false);
2397
- return 0.0f; // only to satisfy the compiler
2398
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2399
  }
2400
 
@@ -2427,8 +2437,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
2427
  return dm4f.x*sumf_d - dm4f.y*sumf_m;
2428
 
2429
  #else
2430
- assert(false);
2431
- return 0.0f; // only to satisfy the compiler
2432
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2433
  }
2434
 
@@ -2458,8 +2467,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
2458
 
2459
  return d*sumf;
2460
  #else
2461
- assert(false);
2462
- return 0.0f; // only to satisfy the compiler
2463
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2464
  }
2465
 
@@ -2490,8 +2498,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
2490
  return d6 * sumf_d;
2491
 
2492
  #else
2493
- assert(false);
2494
- return 0.0f; // only to satisfy the compiler
2495
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2496
  }
2497
 
@@ -3357,8 +3364,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
3357
  return dall * sumf_d - dmin * sumf_m;
3358
 
3359
  #else
3360
- assert(false);
3361
- return 0.0f; // only to satisfy the compiler
3362
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
3363
 
3364
  #endif
@@ -3541,8 +3547,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
3541
  return d * sumf_d;
3542
 
3543
  #else
3544
- assert(false);
3545
- return 0.0f; // only to satisfy the compiler
3546
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
3547
 
3548
  #endif
@@ -3952,7 +3957,7 @@ template <bool need_check> static __global__ void
3952
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3953
  #else
3954
  (void) vec_dot_q4_0_q8_1_mul_mat;
3955
- assert(false);
3956
  #endif // __CUDA_ARCH__ >= CC_VOLTA
3957
  }
3958
 
@@ -4021,7 +4026,7 @@ template <bool need_check> static __global__ void
4021
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4022
  #else
4023
  (void) vec_dot_q4_1_q8_1_mul_mat;
4024
- assert(false);
4025
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4026
  }
4027
 
@@ -4088,7 +4093,7 @@ template <bool need_check> static __global__ void
4088
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4089
  #else
4090
  (void) vec_dot_q5_0_q8_1_mul_mat;
4091
- assert(false);
4092
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4093
  }
4094
 
@@ -4155,7 +4160,7 @@ mul_mat_q5_1(
4155
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4156
  #else
4157
  (void) vec_dot_q5_1_q8_1_mul_mat;
4158
- assert(false);
4159
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4160
  }
4161
 
@@ -4222,7 +4227,7 @@ template <bool need_check> static __global__ void
4222
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4223
  #else
4224
  (void) vec_dot_q8_0_q8_1_mul_mat;
4225
- assert(false);
4226
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4227
  }
4228
 
@@ -4289,7 +4294,7 @@ mul_mat_q2_K(
4289
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4290
  #else
4291
  (void) vec_dot_q2_K_q8_1_mul_mat;
4292
- assert(false);
4293
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4294
  }
4295
 
@@ -4358,7 +4363,7 @@ template <bool need_check> static __global__ void
4358
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4359
  #else
4360
  (void) vec_dot_q3_K_q8_1_mul_mat;
4361
- assert(false);
4362
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4363
  }
4364
 
@@ -4427,7 +4432,7 @@ template <bool need_check> static __global__ void
4427
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4428
  #else
4429
  (void) vec_dot_q4_K_q8_1_mul_mat;
4430
- assert(false);
4431
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4432
  }
4433
 
@@ -4494,7 +4499,7 @@ mul_mat_q5_K(
4494
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4495
  #else
4496
  (void) vec_dot_q5_K_q8_1_mul_mat;
4497
- assert(false);
4498
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4499
  }
4500
 
@@ -4563,7 +4568,7 @@ template <bool need_check> static __global__ void
4563
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4564
  #else
4565
  (void) vec_dot_q6_K_q8_1_mul_mat;
4566
- assert(false);
4567
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4568
  }
4569
 
@@ -4998,7 +5003,16 @@ static __global__ void rope_neox(
4998
  const int ib = col / n_dims;
4999
  const int ic = col % n_dims;
5000
 
5001
- const int i = row*ncols + ib*n_dims + ic/2;
 
 
 
 
 
 
 
 
 
5002
  const int i2 = row/p_delta_rows;
5003
 
5004
  float cur_rot = inv_ndims * ic - ib;
@@ -5259,17 +5273,17 @@ static __global__ void im2col_f32_f16(
5259
  const int ky = (i - kd) / OW;
5260
  const int ix = i % OW;
5261
 
5262
- const int iiw = ix * s0 + kx * d0 - p0;
5263
- const int iih = blockIdx.y * s1 + ky * d1 - p1;
5264
 
5265
- const int offset_dst =
5266
  (blockIdx.y * OW + ix) * CHW +
5267
  (blockIdx.z * (KW * KH) + ky * KW + kx);
5268
 
5269
  if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
5270
  dst[offset_dst] = __float2half(0.0f);
5271
  } else {
5272
- const int offset_src = blockIdx.z * offset_delta;
5273
  dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
5274
  }
5275
  }
@@ -6814,6 +6828,7 @@ static void ggml_cuda_op_get_rows(
6814
  break;
6815
  default:
6816
  // TODO: k-quants
 
6817
  GGML_ASSERT(false);
6818
  break;
6819
  }
@@ -7057,6 +7072,7 @@ inline void ggml_cuda_op_upscale(
7057
 
7058
  (void) src1;
7059
  (void) dst;
 
7060
  }
7061
 
7062
  inline void ggml_cuda_op_pad(
@@ -7073,6 +7089,7 @@ inline void ggml_cuda_op_pad(
7073
 
7074
  (void) src1;
7075
  (void) dst;
 
7076
  }
7077
 
7078
  inline void ggml_cuda_op_rms_norm(
@@ -7376,7 +7393,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
7376
 
7377
  const int compute_capability = g_compute_capabilities[id];
7378
 
7379
- if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
7380
  // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
7381
  half * src0_as_f16 = nullptr;
7382
  size_t src0_as = 0;
@@ -7690,17 +7707,10 @@ inline void ggml_cuda_op_scale(
7690
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7691
 
7692
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7693
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
7694
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
7695
 
7696
  float scale;
7697
- // HACK: support for ggml backend interface
7698
- if (src1->backend == GGML_BACKEND_CPU) {
7699
- scale = ((float *) src1->data)[0];
7700
- } else {
7701
- // TODO: pass pointer to kernel instead of copying to host
7702
- CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
7703
- }
7704
 
7705
  scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
7706
  CUDA_CHECK(cudaGetLastError());
@@ -7747,8 +7757,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
7747
  const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
7748
  const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
7749
 
7750
- const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
7751
-
7752
  // dd = data device
7753
  float * src0_ddf = nullptr;
7754
  float * src1_ddf = nullptr;
@@ -7769,7 +7777,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
7769
  CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
7770
  }
7771
 
7772
- if (use_src1 && !src1_stays_on_host) {
7773
  if (src1_on_device) {
7774
  src1_ddf = (float *) src1_extra->data_device[g_main_device];
7775
  } else {
@@ -7817,6 +7825,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
7817
  }
7818
 
7819
  #ifdef NDEBUG
 
 
 
 
 
7820
  for (int id = 0; id < g_device_count; ++id) {
7821
  CUDA_CHECK(ggml_cuda_set_device(id));
7822
 
@@ -7868,8 +7881,6 @@ static void ggml_cuda_op_mul_mat(
7868
  const int nb2 = dst->nb[2];
7869
  const int nb3 = dst->nb[3];
7870
 
7871
- ggml_cuda_set_peer_access(ne11);
7872
-
7873
  GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
7874
  GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
7875
 
@@ -8300,27 +8311,27 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
8300
  }
8301
 
8302
  static __global__ void k_compute_batched_ptrs(
8303
- const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
8304
  const void ** ptrs_src, void ** ptrs_dst,
8305
- int ne12, int ne13,
8306
- int ne23,
8307
- int nb02, int nb03,
8308
- int nb12, int nb13,
8309
- int nb2, int nb3,
8310
- int r2, int r3) {
8311
- int i13 = blockIdx.x * blockDim.x + threadIdx.x;
8312
- int i12 = blockIdx.y * blockDim.y + threadIdx.y;
8313
 
8314
  if (i13 >= ne13 || i12 >= ne12) {
8315
  return;
8316
  }
8317
 
8318
- int i03 = i13 / r3;
8319
- int i02 = i12 / r2;
8320
 
8321
  ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
8322
  ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
8323
- ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
8324
  }
8325
 
8326
  static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -8376,7 +8387,41 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
8376
  to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
8377
 
8378
  size_t dst_as = 0;
8379
- half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8380
 
8381
  GGML_ASSERT(ne12 % ne02 == 0);
8382
  GGML_ASSERT(ne13 % ne03 == 0);
@@ -8385,9 +8430,6 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
8385
  const int64_t r2 = ne12/ne02;
8386
  const int64_t r3 = ne13/ne03;
8387
 
8388
- const half alpha_f16 = 1.0f;
8389
- const half beta_f16 = 0.0f;
8390
-
8391
  #if 0
8392
  // use cublasGemmEx
8393
  {
@@ -8397,12 +8439,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
8397
  int i02 = i12 / r2;
8398
 
8399
  CUBLAS_CHECK(
8400
- cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
8401
  ne01, ne11, ne10,
8402
- &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
8403
- (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
8404
- &beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
8405
- CUBLAS_COMPUTE_16F,
8406
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
8407
  }
8408
  }
@@ -8414,11 +8456,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
8414
  CUBLAS_CHECK(
8415
  cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
8416
  ne01, ne11, ne10,
8417
- &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
8418
- (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
8419
- &beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC
8420
  ne12*ne13,
8421
- CUBLAS_COMPUTE_16F,
8422
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
8423
  } else {
8424
  // use cublasGemmBatchedEx
@@ -8435,24 +8477,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
8435
 
8436
  dim3 block_dims(ne13, ne12);
8437
  k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
8438
- src0_as_f16, src1_as_f16, dst_f16,
8439
  ptrs_src, ptrs_dst,
8440
  ne12, ne13,
8441
  ne23,
8442
  nb02, nb03,
8443
  nb12, nb13,
8444
- dst->nb[2], dst->nb[3],
8445
  r2, r3);
8446
  CUDA_CHECK(cudaGetLastError());
8447
 
8448
  CUBLAS_CHECK(
8449
  cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
8450
  ne01, ne11, ne10,
8451
- &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
8452
- (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
8453
- &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
8454
  ne23,
8455
- CUBLAS_COMPUTE_16F,
8456
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
8457
 
8458
  if (ptrs_src_s != 0) {
@@ -8464,11 +8506,14 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
8464
  }
8465
  #endif
8466
 
8467
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
8468
- to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
8469
 
8470
  ggml_cuda_pool_free(src1_as_f16, src1_as);
8471
- ggml_cuda_pool_free(dst_f16, dst_as);
8472
  }
8473
 
8474
  static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -8732,7 +8777,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
8732
  // TODO: mmq/mmv support
8733
  #endif
8734
 
8735
- GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
 
8736
 
8737
  const struct ggml_tensor * ids = src0;
8738
  const int32_t id = ((int32_t *) dst->op_params)[0];
@@ -8740,10 +8786,12 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
8740
 
8741
  std::vector<char> ids_host(ggml_nbytes(ids));
8742
 
 
 
8743
  if (ids->backend == GGML_BACKEND_GPU) {
8744
  const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
8745
- CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
8746
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
8747
  } else {
8748
  memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
8749
  }
@@ -8757,37 +8805,110 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
8757
  ggml_tensor src1_row = *src1;
8758
  ggml_tensor dst_row = *dst;
8759
 
8760
- src1_row.ne[1] = 1;
8761
- dst_row.ne[1] = 1;
8762
-
8763
- src1_row.nb[2] = src1_row.nb[1];
8764
- dst_row.nb[2] = dst_row.nb[1];
8765
-
8766
- src1_row.nb[3] = src1_row.nb[1];
8767
- dst_row.nb[3] = dst_row.nb[1];
8768
 
8769
  src1_row.extra = &src1_row_extra;
8770
  dst_row.extra = &dst_row_extra;
8771
8772
 
8773
- for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
8774
- //int32_t row_id;
8775
- //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
8776
- //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
8777
 
8778
- const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
8779
 
8780
- GGML_ASSERT(row_id >= 0 && row_id < n_as);
8781
 
8782
- const struct ggml_tensor * src0_row = dst->src[row_id + 2];
8783
 
8784
- src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
8785
- src1_row.data = (char *) src1->data + i01*src1->nb[1];
8786
 
8787
- dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
8788
- dst_row.data = (char *) dst->data + i01*dst->nb[1];
8789
 
8790
- ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
8791
  }
8792
  }
8793
 
@@ -8898,6 +9019,12 @@ static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, gg
8898
  (void) dst;
8899
  }
8900
8901
  void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
8902
  const int64_t nrows = ggml_nrows(tensor);
8903
 
@@ -8947,13 +9074,12 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
8947
 
8948
  // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
8949
  if (ne0 % MATRIX_ROW_PADDING != 0) {
8950
- size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
8951
- * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
8952
  }
8953
 
8954
  char * buf;
8955
  CUDA_CHECK(cudaMalloc(&buf, size));
8956
- char * buf_host = (char*)data + offset_split;
8957
 
8958
  // set padding to 0 to avoid possible NaN values
8959
  if (size > original_size) {
@@ -8975,7 +9101,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
8975
  }
8976
 
8977
  void ggml_cuda_free_data(struct ggml_tensor * tensor) {
8978
- if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
8979
  return;
8980
  }
8981
 
@@ -9098,11 +9224,10 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
9098
 
9099
  ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
9100
 
9101
- const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
9102
- tensor->op == GGML_OP_VIEW;
9103
 
9104
- if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
9105
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
9106
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
9107
  size_t view_offset = 0;
9108
  if (tensor->op == GGML_OP_VIEW) {
@@ -9182,14 +9307,14 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
9182
  || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
9183
  || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
9184
 
9185
- if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
9186
  return false;
9187
  }
9188
 
9189
  if (tensor->op == GGML_OP_MUL_MAT) {
9190
  if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
9191
  #ifndef NDEBUG
9192
- fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
9193
  #endif
9194
  return false;
9195
  }
@@ -9318,6 +9443,10 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
9318
  return false;
9319
  }
9320
9321
  if (params->ith != 0) {
9322
  return true;
9323
  }
@@ -9391,7 +9520,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
9391
  ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
9392
 
9393
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
9394
- assert(tensor->view_src->buffer->buft == buffer->buft); // TODO
9395
  tensor->backend = tensor->view_src->backend;
9396
  tensor->extra = tensor->view_src->extra;
9397
  return;
@@ -9422,23 +9551,34 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
9422
  }
9423
 
9424
  static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
9425
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
9426
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
9427
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
9428
 
9429
- CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
9430
 
9431
- UNUSED(buffer);
9432
  }
9433
 
9434
  static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
9435
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
9436
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
9437
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
9438
 
 
 
 
 
 
9439
  CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
 
9440
 
9441
- UNUSED(buffer);
9442
  }
9443
 
9444
  static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
@@ -9449,6 +9589,7 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
9449
  /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
9450
  /* .cpy_tensor_from = */ NULL,
9451
  /* .cpy_tensor_to = */ NULL,
 
9452
  };
9453
 
9454
  // cuda buffer type
@@ -9485,8 +9626,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
9485
 
9486
  if (ggml_is_quantized(tensor->type)) {
9487
  if (ne0 % MATRIX_ROW_PADDING != 0) {
9488
- size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
9489
- * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
9490
  }
9491
  }
9492
 
@@ -9501,35 +9641,36 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t
9501
  UNUSED(buft);
9502
  }
9503
 
9504
- static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
9505
  /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
9506
  /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
9507
  /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
9508
  /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
 
9509
  };
9510
 
9511
  ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
9512
- static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
9513
- static bool ggml_backend_buffer_type_cuda_initialized = false;
9514
- if (!ggml_backend_buffer_type_cuda_initialized) {
 
 
9515
  for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
9516
- ggml_backend_buffer_type_cuda[i] = {
9517
- /* .iface = */ cuda_backend_buffer_type_interface,
9518
  /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
9519
  };
9520
  }
9521
- ggml_backend_buffer_type_cuda_initialized = true;
9522
  }
9523
 
9524
- return &ggml_backend_buffer_type_cuda[device];
9525
  }
9526
 
9527
  // host buffer type
9528
 
9529
  static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
9530
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
9531
- CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
9532
- delete ctx;
9533
  }
9534
 
9535
  static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -9542,24 +9683,21 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
9542
  buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
9543
 
9544
  return buffer;
9545
-
9546
- UNUSED(buft);
9547
  }
9548
 
9549
- struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
9550
- /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
9551
- /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
9552
- /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
9553
- /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
9554
- };
9555
-
9556
  ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
9557
- static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
9558
- /* .iface = */ cuda_backend_host_buffer_type_interface,
9559
  /* .context = */ nullptr,
9560
  };
9561
 
9562
- return &ggml_backend_buffer_type_cuda_host;
9563
  }
9564
 
9565
  // backend
@@ -9591,8 +9729,6 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
9591
  ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9592
 
9593
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
9594
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
9595
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
9596
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
9597
 
9598
  CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
@@ -9602,8 +9738,6 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
9602
  ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9603
 
9604
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
9605
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
9606
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
9607
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
9608
 
9609
  CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
 
31
  #define CUDA_R_16F HIPBLAS_R_16F
32
  #define CUDA_R_32F HIPBLAS_R_32F
33
  #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
34
+ #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
35
  #define cublasCreate hipblasCreate
36
  #define cublasGemmEx hipblasGemmEx
37
  #define cublasGemmBatchedEx hipblasGemmBatchedEx
 
41
  #define cublasSetStream hipblasSetStream
42
  #define cublasSgemm hipblasSgemm
43
  #define cublasStatus_t hipblasStatus_t
44
+ #define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
45
  #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
46
  #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
47
  #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
 
60
  #define cudaGetDeviceProperties hipGetDeviceProperties
61
  #define cudaGetErrorString hipGetErrorString
62
  #define cudaGetLastError hipGetLastError
63
+ #ifdef GGML_HIP_UMA
64
+ #define cudaMalloc hipMallocManaged
65
+ #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
66
+ #else
67
  #define cudaMalloc hipMalloc
68
  #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
69
+ #endif
70
  #define cudaMemcpy hipMemcpy
71
  #define cudaMemcpy2DAsync hipMemcpy2DAsync
72
  #define cudaMemcpyAsync hipMemcpyAsync
 
85
  #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
86
  #define cudaStream_t hipStream_t
87
  #define cudaSuccess hipSuccess
88
+ #define __trap abort
89
  #else
90
  #include <cuda_runtime.h>
91
  #include <cublas_v2.h>
92
  #include <cuda_fp16.h>
93
+ // CUDA 10.2 does not have these macro definitions.
94
+ #ifndef CUBLAS_TF32_TENSOR_OP_MATH
95
+ #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
96
+ #define CUBLAS_COMPUTE_16F CUDA_R_16F
97
+ #define CUBLAS_COMPUTE_32F CUDA_R_32F
98
+ #define cublasComputeType_t cudaDataType_t
99
+ #endif
100
  #endif // defined(GGML_USE_HIPBLAS)
101
 
102
  #include "ggml-cuda.h"
 
525
 
526
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
527
 
528
+ [[noreturn]]
529
+ static __device__ void bad_arch() {
530
+ printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
531
+ __trap();
532
+
533
+ (void) bad_arch; // suppress unused function warning
534
+ }
535
+
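Illustration (not from the patch): the new bad_arch() helper above is what the quantized dot-product kernels further down now call in their #else branches instead of per-kernel fallbacks, so a build that lacks support for the running GPU traps with a message at runtime. A minimal sketch of that pattern, assuming only bad_arch() and MIN_CC_DP4A from the surrounding file; the kernel name is made up:
// illustrative only - mirrors the pattern used by the vec_dot_*_q8_1 kernels below
static __device__ __forceinline__ float vec_dot_example(const int a, const int b) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A
    return (float) __dp4a(a, b, 0);   // fast 8-bit integer dot product on supported GPUs
#else
    bad_arch();                       // [[noreturn]]: prints an error and __trap()s on unsupported GPUs
#endif
}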
536
  static __device__ __forceinline__ float warp_reduce_sum(float x) {
537
  #pragma unroll
538
  for (int mask = 16; mask > 0; mask >>= 1) {
 
1993
  // second part effectively subtracts 8 from each quant value
1994
  return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
1995
  #else
1996
+ bad_arch();
 
1997
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1998
  }
1999
 
 
2030
  // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
2031
  return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
2032
  #else
2033
+ bad_arch();
 
2034
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2035
  }
2036
 
 
2065
  // second part effectively subtracts 16 from each quant value
2066
  return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
2067
  #else
2068
+ bad_arch();
 
2069
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2070
  }
2071
 
 
2110
  return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
2111
 
2112
  #else
2113
+ bad_arch();
 
2114
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2115
  }
2116
 
 
2131
 
2132
  return d8_0*d8_1 * sumi;
2133
  #else
2134
+ bad_arch();
 
2135
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2136
  }
2137
 
 
2161
  // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
2162
  return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
2163
  #else
2164
+ bad_arch();
 
2165
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2166
  }
2167
 
 
2196
 
2197
  return dm2f.x*sumf_d - dm2f.y*sumf_m;
2198
  #else
2199
+ bad_arch();
 
2200
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2201
  }
2202
 
 
2233
 
2234
  return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
2235
  #else
2236
+ bad_arch();
 
2237
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2238
  }
2239
 
 
2273
 
2274
  return d3 * sumf;
2275
  #else
2276
+ bad_arch();
 
2277
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2278
  }
2279
 
 
2298
 
2299
  return d3*d8 * sumi;
2300
  #else
2301
+ bad_arch();
 
2302
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2303
  }
2304
 
 
2331
  return dm4f.x*sumf_d - dm4f.y*sumf_m;
2332
 
2333
  #else
2334
+ bad_arch();
 
2335
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2336
  }
2337
 
 
2364
  return dm4f.x*sumf_d - dm4f.y*sumf_m;
2365
 
2366
  #else
2367
+ bad_arch();
 
2368
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2369
  }
2370
 
 
2404
  return dm5f.x*sumf_d - dm5f.y*sumf_m;
2405
 
2406
  #else
2407
+ bad_arch();
 
2408
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2409
  }
2410
 
 
2437
  return dm4f.x*sumf_d - dm4f.y*sumf_m;
2438
 
2439
  #else
2440
+ bad_arch();
 
2441
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2442
  }
2443
 
 
2467
 
2468
  return d*sumf;
2469
  #else
2470
+ bad_arch();
 
2471
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2472
  }
2473
 
 
2498
  return d6 * sumf_d;
2499
 
2500
  #else
2501
+ bad_arch();
 
2502
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2503
  }
2504
 
 
3364
  return dall * sumf_d - dmin * sumf_m;
3365
 
3366
  #else
3367
+ bad_arch();
 
3368
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
3369
 
3370
  #endif
 
3547
  return d * sumf_d;
3548
 
3549
  #else
3550
+ bad_arch();
 
3551
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
3552
 
3553
  #endif
 
3957
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3958
  #else
3959
  (void) vec_dot_q4_0_q8_1_mul_mat;
3960
+ bad_arch();
3961
  #endif // __CUDA_ARCH__ >= CC_VOLTA
3962
  }
3963
 
 
4026
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4027
  #else
4028
  (void) vec_dot_q4_1_q8_1_mul_mat;
4029
+ bad_arch();
4030
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4031
  }
4032
 
 
4093
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4094
  #else
4095
  (void) vec_dot_q5_0_q8_1_mul_mat;
4096
+ bad_arch();
4097
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4098
  }
4099
 
 
4160
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4161
  #else
4162
  (void) vec_dot_q5_1_q8_1_mul_mat;
4163
+ bad_arch();
4164
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4165
  }
4166
 
 
4227
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4228
  #else
4229
  (void) vec_dot_q8_0_q8_1_mul_mat;
4230
+ bad_arch();
4231
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4232
  }
4233
 
 
4294
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4295
  #else
4296
  (void) vec_dot_q2_K_q8_1_mul_mat;
4297
+ bad_arch();
4298
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4299
  }
4300
 
 
4363
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4364
  #else
4365
  (void) vec_dot_q3_K_q8_1_mul_mat;
4366
+ bad_arch();
4367
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4368
  }
4369
 
 
4432
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4433
  #else
4434
  (void) vec_dot_q4_K_q8_1_mul_mat;
4435
+ bad_arch();
4436
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4437
  }
4438
 
 
4499
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4500
  #else
4501
  (void) vec_dot_q5_K_q8_1_mul_mat;
4502
+ bad_arch();
4503
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4504
  }
4505
 
 
4568
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4569
  #else
4570
  (void) vec_dot_q6_K_q8_1_mul_mat;
4571
+ bad_arch();
4572
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4573
  }
4574
 
 
5003
  const int ib = col / n_dims;
5004
  const int ic = col % n_dims;
5005
 
5006
+ if (ib > 0) {
5007
+ const int i = row*ncols + ib*n_dims + ic;
5008
+
5009
+ dst[i + 0] = x[i + 0];
5010
+ dst[i + 1] = x[i + 1];
5011
+
5012
+ return;
5013
+ }
5014
+
5015
+ const int i = row*ncols + ib*n_dims + ic/2;
5016
  const int i2 = row/p_delta_rows;
5017
 
5018
  float cur_rot = inv_ndims * ic - ib;
 
5273
  const int ky = (i - kd) / OW;
5274
  const int ix = i % OW;
5275
 
5276
+ const int64_t iiw = ix * s0 + kx * d0 - p0;
5277
+ const int64_t iih = blockIdx.y * s1 + ky * d1 - p1;
5278
 
5279
+ const int64_t offset_dst =
5280
  (blockIdx.y * OW + ix) * CHW +
5281
  (blockIdx.z * (KW * KH) + ky * KW + kx);
5282
 
5283
  if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
5284
  dst[offset_dst] = __float2half(0.0f);
5285
  } else {
5286
+ const int64_t offset_src = blockIdx.z * offset_delta;
5287
  dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
5288
  }
5289
  }
 
6828
  break;
6829
  default:
6830
  // TODO: k-quants
6831
+ fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
6832
  GGML_ASSERT(false);
6833
  break;
6834
  }
 
7072
 
7073
  (void) src1;
7074
  (void) dst;
7075
+ (void) src1_dd;
7076
  }
7077
 
7078
  inline void ggml_cuda_op_pad(
 
7089
 
7090
  (void) src1;
7091
  (void) dst;
7092
+ (void) src1_dd;
7093
  }
7094
 
7095
  inline void ggml_cuda_op_rms_norm(
 
7393
 
7394
  const int compute_capability = g_compute_capabilities[id];
7395
 
7396
+ if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
7397
  // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
7398
  half * src0_as_f16 = nullptr;
7399
  size_t src0_as = 0;
 
7707
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7708
 
7709
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
 
7710
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
7711
 
7712
  float scale;
7713
+ memcpy(&scale, dst->op_params, sizeof(float));
7714
 
7715
  scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
7716
  CUDA_CHECK(cudaGetLastError());
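Illustration (not from the patch): in the new version of ggml_cuda_op_scale above, the scale factor is read from dst->op_params with memcpy rather than from a separate src1 tensor (the Metal path further down does the same with *(const float *) dst->op_params). A minimal sketch of that pattern, assuming op_params is the usual ggml int32_t array; the helper names are made up for the example:
// illustrative helpers, not part of the diff
#include <stdint.h>
#include <string.h>

static void op_params_set_f32(int32_t * op_params, float value) {
    memcpy(&op_params[0], &value, sizeof(float));  // store the float bit pattern without type punning
}

static float op_params_get_f32(const int32_t * op_params) {
    float value;
    memcpy(&value, &op_params[0], sizeof(float));  // same read as in ggml_cuda_op_scale above
    return value;
}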
 
7757
  const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
7758
  const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
7759
 
 
 
7760
  // dd = data device
7761
  float * src0_ddf = nullptr;
7762
  float * src1_ddf = nullptr;
 
7777
  CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
7778
  }
7779
 
7780
+ if (use_src1) {
7781
  if (src1_on_device) {
7782
  src1_ddf = (float *) src1_extra->data_device[g_main_device];
7783
  } else {
 
7825
  }
7826
 
7827
  #ifdef NDEBUG
7828
+ for (int id = 0; id < g_device_count; ++id) {
7829
+ CUDA_CHECK(ggml_cuda_set_device(id));
7830
+ CUDA_CHECK(cudaDeviceSynchronize());
7831
+ }
7832
+
7833
  for (int id = 0; id < g_device_count; ++id) {
7834
  CUDA_CHECK(ggml_cuda_set_device(id));
7835
 
 
7881
  const int nb2 = dst->nb[2];
7882
  const int nb3 = dst->nb[3];
7883
 
 
 
7884
  GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
7885
  GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
7886
 
 
8311
  }
8312
 
8313
  static __global__ void k_compute_batched_ptrs(
8314
+ const half * src0_as_f16, const half * src1_as_f16, char * dst,
8315
  const void ** ptrs_src, void ** ptrs_dst,
8316
+ int64_t ne12, int64_t ne13,
8317
+ int64_t ne23,
8318
+ size_t nb02, size_t nb03,
8319
+ size_t nb12, size_t nb13,
8320
+ size_t nbd2, size_t nbd3,
8321
+ int64_t r2, int64_t r3) {
8322
+ int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
8323
+ int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
8324
 
8325
  if (i13 >= ne13 || i12 >= ne12) {
8326
  return;
8327
  }
8328
 
8329
+ int64_t i03 = i13 / r3;
8330
+ int64_t i02 = i12 / r2;
8331
 
8332
  ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
8333
  ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
8334
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
8335
  }
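For reference, an illustrative host-side rendering (not from the patch) of the broadcast indexing that k_compute_batched_ptrs performs on the GPU; parameter names mirror the kernel above, and the function name is made up:
// illustrative only - plain-loop equivalent of the kernel above
#include <stddef.h>
#include <stdint.h>

static void compute_batched_ptrs_sketch(
        const void * src0_as_f16, const void * src1_as_f16, void * dst,
        const void ** ptrs_src, void ** ptrs_dst,
        int64_t ne12, int64_t ne13, int64_t ne23,
        size_t nb02, size_t nb03, size_t nb12, size_t nb13,
        size_t nbd2, size_t nbd3, int64_t r2, int64_t r3) {
    for (int64_t i13 = 0; i13 < ne13; ++i13) {
        for (int64_t i12 = 0; i12 < ne12; ++i12) {
            const int64_t i03 = i13 / r3; // src0 is broadcast over the batch dims,
            const int64_t i02 = i12 / r2; // hence the integer division by r2/r3
            ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02   + i03*nb03;
            ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
            ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *) dst         + i12*nbd2   + i13*nbd3;
        }
    }
}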
8336
 
8337
  static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
8387
  to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
8388
 
8389
  size_t dst_as = 0;
8390
+
8391
+ half * dst_f16 = nullptr;
8392
+ char * dst_t = nullptr;
8393
+
8394
+ cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
8395
+ cudaDataType_t cu_data_type = CUDA_R_16F;
8396
+
8397
+ // dst strides
8398
+ size_t nbd2 = dst->nb[2];
8399
+ size_t nbd3 = dst->nb[3];
8400
+
8401
+ const half alpha_f16 = 1.0f;
8402
+ const half beta_f16 = 0.0f;
8403
+
8404
+ const float alpha_f32 = 1.0f;
8405
+ const float beta_f32 = 0.0f;
8406
+
8407
+ const void * alpha = &alpha_f16;
8408
+ const void * beta = &beta_f16;
8409
+
8410
+ if (dst->op_params[0] == GGML_PREC_DEFAULT) {
8411
+ dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
8412
+ dst_t = (char *) dst_f16;
8413
+
8414
+ nbd2 /= sizeof(float) / sizeof(half);
8415
+ nbd3 /= sizeof(float) / sizeof(half);
8416
+ } else {
8417
+ dst_t = (char *) dst_ddf;
8418
+
8419
+ cu_compute_type = CUBLAS_COMPUTE_32F;
8420
+ cu_data_type = CUDA_R_32F;
8421
+
8422
+ alpha = &alpha_f32;
8423
+ beta = &beta_f32;
8424
+ }
8425
 
8426
  GGML_ASSERT(ne12 % ne02 == 0);
8427
  GGML_ASSERT(ne13 % ne03 == 0);
 
8430
  const int64_t r2 = ne12/ne02;
8431
  const int64_t r3 = ne13/ne03;
8432
 
 
 
 
8433
  #if 0
8434
  // use cublasGemmEx
8435
  {
 
8439
  int i02 = i12 / r2;
8440
 
8441
  CUBLAS_CHECK(
8442
+ cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
8443
  ne01, ne11, ne10,
8444
+ alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
8445
+ (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
8446
+ beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
8447
+ cu_compute_type,
8448
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
8449
  }
8450
  }
 
8456
  CUBLAS_CHECK(
8457
  cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
8458
  ne01, ne11, ne10,
8459
+ alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
8460
+ (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
8461
+ beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC
8462
  ne12*ne13,
8463
+ cu_compute_type,
8464
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
8465
  } else {
8466
  // use cublasGemmBatchedEx
 
8477
 
8478
  dim3 block_dims(ne13, ne12);
8479
  k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
8480
+ src0_as_f16, src1_as_f16, dst_t,
8481
  ptrs_src, ptrs_dst,
8482
  ne12, ne13,
8483
  ne23,
8484
  nb02, nb03,
8485
  nb12, nb13,
8486
+ nbd2, nbd3,
8487
  r2, r3);
8488
  CUDA_CHECK(cudaGetLastError());
8489
 
8490
  CUBLAS_CHECK(
8491
  cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
8492
  ne01, ne11, ne10,
8493
+ alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
8494
+ (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
8495
+ beta, ( void **) (ptrs_dst + 0*ne23), cu_data_type, ne01,
8496
  ne23,
8497
+ cu_compute_type,
8498
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
8499
 
8500
  if (ptrs_src_s != 0) {
 
8506
  }
8507
  #endif
8508
 
8509
+ if (dst->op_params[0] == GGML_PREC_DEFAULT) {
8510
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
8511
+ to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
8512
+
8513
+ ggml_cuda_pool_free(dst_f16, dst_as);
8514
+ }
8515
 
8516
  ggml_cuda_pool_free(src1_as_f16, src1_as);
 
8517
  }
8518
 
8519
  static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
8777
  // TODO: mmq/mmv support
8778
  #endif
8779
 
8780
+ const int64_t nb11 = src1->nb[1];
8781
+ const int64_t nb1 = dst->nb[1];
8782
 
8783
  const struct ggml_tensor * ids = src0;
8784
  const int32_t id = ((int32_t *) dst->op_params)[0];
 
8786
 
8787
  std::vector<char> ids_host(ggml_nbytes(ids));
8788
 
8789
+ const cudaStream_t stream = g_cudaStreams[g_main_device][0];
8790
+
8791
  if (ids->backend == GGML_BACKEND_GPU) {
8792
  const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
8793
+ CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
8794
+ CUDA_CHECK(cudaStreamSynchronize(stream));
8795
  } else {
8796
  memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
8797
  }
 
8805
  ggml_tensor src1_row = *src1;
8806
  ggml_tensor dst_row = *dst;
8807
 
8808
+ src1_row.backend = GGML_BACKEND_GPU;
8809
+ dst_row.backend = GGML_BACKEND_GPU;
 
 
 
 
 
 
8810
 
8811
  src1_row.extra = &src1_row_extra;
8812
  dst_row.extra = &dst_row_extra;
8813
 
8814
+ char * src1_original = src1->backend == GGML_BACKEND_CPU ?
8815
+ (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
8816
+ char * dst_original = dst->backend == GGML_BACKEND_CPU ?
8817
+ (char *) dst->data : (char *) dst_extra->data_device[g_main_device];
8818
 
8819
+ if (src1->ne[1] == 1) {
8820
+ GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
8821
+ GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
 
8822
 
8823
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
8824
+ //int32_t row_id;
8825
+ //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
8826
+ //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
8827
 
8828
+ const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
8829
 
8830
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
8831
 
8832
+ const struct ggml_tensor * src0_row = dst->src[row_id + 2];
 
8833
 
8834
+ src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
8835
+ src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
8836
 
8837
+ dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
8838
+ dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
8839
+
8840
+ ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
8841
+ }
8842
+ } else {
8843
+ size_t as_src1, as_dst;
8844
+ char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
8845
+ char * dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst), &as_dst);
8846
+
8847
+ src1_row_extra.data_device[g_main_device] = src1_contiguous;
8848
+ dst_row_extra.data_device[g_main_device] = dst_contiguous;
8849
+
8850
+ const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
8851
+ cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
8852
+ const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
8853
+ cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;
8854
+
8855
+ for (int32_t row_id = 0; row_id < n_as; ++row_id) {
8856
+ const struct ggml_tensor * src0_row = dst->src[row_id + 2];
8857
+
8858
+ int64_t num_src1_rows = 0;
8859
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
8860
+ const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
8861
+
8862
+ if (row_id_i != row_id) {
8863
+ continue;
8864
+ }
8865
+
8866
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
8867
+
8868
+ CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
8869
+ nb11, src1_kind, stream));
8870
+ num_src1_rows++;
8871
+ }
8872
+
8873
+ if (num_src1_rows == 0) {
8874
+ continue;
8875
+ }
8876
+
8877
+ src1_row.ne[1] = num_src1_rows;
8878
+ dst_row.ne[1] = num_src1_rows;
8879
+
8880
+ src1_row.nb[1] = nb11;
8881
+ src1_row.nb[2] = num_src1_rows*nb11;
8882
+ src1_row.nb[3] = num_src1_rows*nb11;
8883
+
8884
+ dst_row.nb[1] = nb1;
8885
+ dst_row.nb[2] = num_src1_rows*nb1;
8886
+ dst_row.nb[3] = num_src1_rows*nb1;
8887
+
8888
+ ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
8889
+
8890
+ num_src1_rows = 0;
8891
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
8892
+ const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
8893
+
8894
+ if (row_id_i != row_id) {
8895
+ continue;
8896
+ }
8897
+
8898
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
8899
+
8900
+ CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
8901
+ nb1, dst_kind, stream));
8902
+ num_src1_rows++;
8903
+ }
8904
+ }
8905
+
8906
+ ggml_cuda_pool_free(src1_contiguous, as_src1);
8907
+ ggml_cuda_pool_free(dst_contiguous, as_dst);
8908
+ }
8909
+
8910
+ if (dst->backend == GGML_BACKEND_CPU) {
8911
+ CUDA_CHECK(cudaStreamSynchronize(stream));
8912
  }
8913
  }
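A compact sketch (illustration only, not from the patch) of the new batched branch of ggml_cuda_mul_mat_id above: rows routed to the same expert are gathered into a contiguous buffer, multiplied once, then scattered back to their original positions. All names below are made up for the example, and memcpy stands in for the cudaMemcpyAsync calls:
// illustrative gather / compute / scatter loop over host memory
#include <stdint.h>
#include <string.h>
#include <stddef.h>

static void mul_mat_id_rows_sketch(
        int32_t n_as, int64_t n_tokens, const int32_t * row_ids,
        const char * src1, char * dst, char * src1_tmp, char * dst_tmp,
        size_t nb11, size_t nb1,
        void (*mul_mat_rows)(int32_t expert, int64_t n_rows, const char * src1_rows, char * dst_rows)) {
    for (int32_t expert = 0; expert < n_as; ++expert) {
        int64_t n_rows = 0;
        for (int64_t i = 0; i < n_tokens; ++i) {
            if (row_ids[i] == expert) {
                memcpy(src1_tmp + n_rows*nb11, src1 + i*nb11, nb11); // gather the rows routed to this expert
                n_rows++;
            }
        }
        if (n_rows == 0) {
            continue;                                                // no tokens for this expert
        }
        mul_mat_rows(expert, n_rows, src1_tmp, dst_tmp);             // one matrix multiplication per expert
        n_rows = 0;
        for (int64_t i = 0; i < n_tokens; ++i) {
            if (row_ids[i] == expert) {
                memcpy(dst + i*nb1, dst_tmp + n_rows*nb1, nb1);      // scatter the results back
                n_rows++;
            }
        }
    }
}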
8914
 
 
9019
  (void) dst;
9020
  }
9021
 
9022
+ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
9023
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
9024
+
9025
+ return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
9026
+ }
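Illustration (not from the patch): the new ggml_nbytes_split above and the padding computations further down use ggml_row_size instead of spelling out ggml_type_size/ggml_blck_size. A small sketch of the equivalent byte arithmetic, assuming ggml_row_size(type, n) == n*ggml_type_size(type)/ggml_blck_size(type), which is what the replaced expressions computed; the helper names are made up:
// illustrative only: the byte math behind the ggml_row_size() calls in this hunk
#include <stddef.h>
#include <stdint.h>

static size_t row_size_sketch(size_t type_size, int64_t blck_size, int64_t n_elements) {
    return (size_t)(n_elements*type_size/blck_size);
}

static size_t padded_size_sketch(size_t size, size_t type_size, int64_t blck_size,
                                 int64_t ne0, int64_t row_padding) {
    if (ne0 % row_padding != 0) {
        // pad the last row up to the next multiple of row_padding elements
        size += row_size_sketch(type_size, blck_size, row_padding - ne0 % row_padding);
    }
    return size;
}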
9027
+
9028
  void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
9029
  const int64_t nrows = ggml_nrows(tensor);
9030
 
 
9074
 
9075
  // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
9076
  if (ne0 % MATRIX_ROW_PADDING != 0) {
9077
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
 
9078
  }
9079
 
9080
  char * buf;
9081
  CUDA_CHECK(cudaMalloc(&buf, size));
9082
+ char * buf_host = (char *)data + offset_split;
9083
 
9084
  // set padding to 0 to avoid possible NaN values
9085
  if (size > original_size) {
 
9101
  }
9102
 
9103
  void ggml_cuda_free_data(struct ggml_tensor * tensor) {
9104
+ if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
9105
  return;
9106
  }
9107
 
 
9224
 
9225
  ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
9226
 
9227
+ const bool inplace = tensor->view_src != nullptr;
 
9228
 
9229
+ if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
9230
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
9231
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
9232
  size_t view_offset = 0;
9233
  if (tensor->op == GGML_OP_VIEW) {
 
9307
  || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
9308
  || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
9309
 
9310
+ if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
9311
  return false;
9312
  }
9313
 
9314
  if (tensor->op == GGML_OP_MUL_MAT) {
9315
  if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
9316
  #ifndef NDEBUG
9317
+ fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
9318
  #endif
9319
  return false;
9320
  }
 
9443
  return false;
9444
  }
9445
 
9446
+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
9447
+ ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
9448
+ }
9449
+
9450
  if (params->ith != 0) {
9451
  return true;
9452
  }
 
9520
  ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
9521
 
9522
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
9523
+ assert(tensor->view_src->buffer->buft == buffer->buft);
9524
  tensor->backend = tensor->view_src->backend;
9525
  tensor->extra = tensor->view_src->extra;
9526
  return;
 
9551
  }
9552
 
9553
  static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
 
 
9554
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
9555
 
9556
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
9557
 
9558
+ ggml_cuda_set_device(ctx->device);
9559
+ CUDA_CHECK(cudaDeviceSynchronize());
9560
+
9561
+ CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
9562
  }
9563
 
9564
  static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
 
 
9565
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
9566
 
9567
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
9568
+
9569
+ ggml_cuda_set_device(ctx->device);
9570
+ CUDA_CHECK(cudaDeviceSynchronize());
9571
+
9572
  CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
9573
+ }
9574
 
9575
+ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
9576
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
9577
+
9578
+ ggml_cuda_set_device(ctx->device);
9579
+ CUDA_CHECK(cudaDeviceSynchronize());
9580
+
9581
+ CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
9582
  }
9583
 
9584
  static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
 
9589
  /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
9590
  /* .cpy_tensor_from = */ NULL,
9591
  /* .cpy_tensor_to = */ NULL,
9592
+ /* .clear = */ ggml_backend_cuda_buffer_clear,
9593
  };
9594
 
9595
  // cuda buffer type
 
9626
 
9627
  if (ggml_is_quantized(tensor->type)) {
9628
  if (ne0 % MATRIX_ROW_PADDING != 0) {
9629
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
 
9630
  }
9631
  }
9632
 
 
9641
  UNUSED(buft);
9642
  }
9643
 
9644
+ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
9645
  /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
9646
  /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
9647
  /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
9648
  /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
9649
+ /* .is_host = */ nullptr,
9650
  };
9651
 
9652
  ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
9653
+ static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
9654
+
9655
+ static bool ggml_backend_cuda_buffer_type_initialized = false;
9656
+
9657
+ if (!ggml_backend_cuda_buffer_type_initialized) {
9658
  for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
9659
+ ggml_backend_cuda_buffer_types[i] = {
9660
+ /* .iface = */ ggml_backend_cuda_buffer_type_interface,
9661
  /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
9662
  };
9663
  }
9664
+ ggml_backend_cuda_buffer_type_initialized = true;
9665
  }
9666
 
9667
+ return &ggml_backend_cuda_buffer_types[device];
9668
  }
9669
 
9670
  // host buffer type
9671
 
9672
  static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
9673
+ CUDA_CHECK(cudaFreeHost(buffer->context));
 
 
9674
  }
9675
 
9676
  static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 
9683
  buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
9684
 
9685
  return buffer;
 
 
9686
  }
9687
9688
  ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
9689
+ static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
9690
+ /* .iface = */ {
9691
+ /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
9692
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
9693
+ /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
9694
+ /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
9695
+ /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
9696
+ },
9697
  /* .context = */ nullptr,
9698
  };
9699
 
9700
+ return &ggml_backend_cuda_buffer_type_host;
9701
  }
9702
 
9703
  // backend
 
9729
  ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9730
 
9731
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
 
 
9732
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
9733
 
9734
  CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
 
9738
  ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9739
 
9740
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
 
 
9741
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
9742
 
9743
  CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
ggml-metal.h CHANGED
@@ -98,7 +98,10 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
98
 
99
  GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
100
 
 
 
101
  GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
 
102
  GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
103
 
104
  // helper to check if the device supports a specific family
 
98
 
99
  GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
100
 
101
+ GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
102
+
103
  GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
104
+
105
  GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
106
 
107
  // helper to check if the device supports a specific family
ggml-metal.m CHANGED
@@ -180,7 +180,15 @@ struct ggml_metal_context {
180
  @implementation GGMLMetalClass
181
  @end
182
 
183
- ggml_log_callback ggml_metal_log_callback = NULL;
184
  void * ggml_metal_log_user_data = NULL;
185
 
186
  void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
@@ -607,12 +615,24 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
607
  }
608
 
609
  // temporarily defined here for compatibility between ggml-backend and the old API
610
- struct ggml_backend_metal_buffer_context {
611
- void * data;
 
 
612
 
613
  id<MTLBuffer> metal;
614
  };
615
616
  // finds the Metal buffer that contains the tensor data on the GPU device
617
  // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
618
  // Metal buffer based on the host memory pointer
@@ -622,17 +642,29 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
622
 
623
  const int64_t tsize = ggml_nbytes(t);
624
 
 
 
625
  // compatibility with ggml-backend
626
- if (t->buffer && t->buffer->buft == ggml_backend_metal_buffer_type()) {
627
- struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) t->buffer->context;
628
 
629
- const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->data;
 
 
630
 
631
- GGML_ASSERT(ioffs >= 0 && ioffs + tsize <= (int64_t) t->buffer->size);
 
633
- *offs = (size_t) ioffs;
634
 
635
- return buf_ctx->metal;
636
  }
637
 
638
  // find the view that contains the tensor fully
@@ -1261,7 +1293,7 @@ void ggml_metal_graph_compute(
1261
  {
1262
  GGML_ASSERT(ggml_is_contiguous(src0));
1263
 
1264
- const float scale = *(const float *) src1->data;
1265
 
1266
  int64_t n = ggml_nelements(dst);
1267
 
@@ -1272,8 +1304,8 @@ void ggml_metal_graph_compute(
1272
  [encoder setComputePipelineState:ctx->pipeline_scale];
1273
  }
1274
 
1275
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
1276
- [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
1277
  [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
1278
 
1279
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
@@ -2361,6 +2393,7 @@ void ggml_metal_graph_compute(
2361
 
2362
  // backend interface
2363
 
 
2364
  static id<MTLDevice> g_backend_device = nil;
2365
  static int g_backend_device_ref_count = 0;
2366
 
@@ -2388,34 +2421,31 @@ static void ggml_backend_metal_free_device(void) {
2388
  static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
2389
  struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
2390
 
2391
- return ctx->data;
2392
  }
2393
 
2394
  static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
2395
  struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
2396
 
2397
- [ctx->metal release];
 
 
2398
  ggml_backend_metal_free_device();
2399
 
2400
- free(ctx->data);
2401
- free(ctx);
 
2402
 
2403
- UNUSED(buffer);
2404
  }
2405
 
2406
  static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2407
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
2408
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
2409
-
2410
  memcpy((char *)tensor->data + offset, data, size);
2411
 
2412
  UNUSED(buffer);
2413
  }
2414
 
2415
  static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
2416
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
2417
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
2418
-
2419
  memcpy(data, (const char *)tensor->data + offset, size);
2420
 
2421
  UNUSED(buffer);
@@ -2433,7 +2463,13 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer
2433
  UNUSED(buffer);
2434
  }
2435
 
2436
- static struct ggml_backend_buffer_i metal_backend_buffer_i = {
2437
  /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
2438
  /* .get_base = */ ggml_backend_metal_buffer_get_base,
2439
  /* .init_tensor = */ NULL,
@@ -2441,8 +2477,11 @@ static struct ggml_backend_buffer_i metal_backend_buffer_i = {
2441
  /* .get_tensor = */ ggml_backend_metal_buffer_get_tensor,
2442
  /* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from,
2443
  /* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to,
 
2444
  };
2445
 
 
 
2446
  static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
2447
  struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
2448
 
@@ -2453,13 +2492,46 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
2453
  size_aligned += (size_page - (size_aligned % size_page));
2454
  }
2455
 
2456
- ctx->data = ggml_metal_host_malloc(size);
2457
- ctx->metal = [ggml_backend_metal_get_device() newBufferWithBytesNoCopy:ctx->data
2458
  length:size_aligned
2459
  options:MTLResourceStorageModeShared
2460
  deallocator:nil];
2461
 
2462
- return ggml_backend_buffer_init(buft, metal_backend_buffer_i, ctx, size);
2463
  }
2464
 
2465
  static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
@@ -2470,7 +2542,13 @@ static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_t
2470
  static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
2471
  return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
2472
 
2473
- GGML_UNUSED(buft);
2474
  }
2475
 
2476
  ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
@@ -2480,6 +2558,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
2480
  /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
2481
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
2482
  /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
 
2483
  },
2484
  /* .context = */ NULL,
2485
  };
@@ -2487,6 +2566,87 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
2487
  return &ggml_backend_buffer_type_metal;
2488
  }
2489
2490
  static const char * ggml_backend_metal_name(ggml_backend_t backend) {
2491
  return "Metal";
2492
 
@@ -2499,10 +2659,6 @@ static void ggml_backend_metal_free(ggml_backend_t backend) {
2499
  free(backend);
2500
  }
2501
 
2502
- static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
2503
- UNUSED(backend);
2504
- }
2505
-
2506
  static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
2507
  return ggml_backend_metal_buffer_type();
2508
 
@@ -2529,25 +2685,15 @@ static struct ggml_backend_i metal_backend_i = {
2529
  /* .get_tensor_async = */ NULL,
2530
  /* .cpy_tensor_from_async = */ NULL,
2531
  /* .cpy_tensor_to_async = */ NULL,
2532
- /* .synchronize = */ ggml_backend_metal_synchronize,
2533
- /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm
2534
  /* .graph_plan_free = */ NULL,
2535
  /* .graph_plan_compute = */ NULL,
2536
  /* .graph_compute = */ ggml_backend_metal_graph_compute,
2537
  /* .supports_op = */ ggml_backend_metal_supports_op,
2538
  };
2539
 
2540
- // TODO: make a common log callback for all backends in ggml-backend
2541
- static void ggml_backend_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
2542
- fprintf(stderr, "%s", msg);
2543
-
2544
- UNUSED(level);
2545
- UNUSED(user_data);
2546
- }
2547
-
2548
  ggml_backend_t ggml_backend_metal_init(void) {
2549
- ggml_metal_log_set_callback(ggml_backend_log_callback, NULL);
2550
-
2551
  struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
2552
 
2553
  if (ctx == NULL) {
 
180
  @implementation GGMLMetalClass
181
  @end
182
 
183
+
184
+ static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
185
+ fprintf(stderr, "%s", msg);
186
+
187
+ UNUSED(level);
188
+ UNUSED(user_data);
189
+ }
190
+
191
+ ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
192
  void * ggml_metal_log_user_data = NULL;
193
 
194
  void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
 
615
  }
616
 
617
  // temporarily defined here for compatibility between ggml-backend and the old API
618
+
619
+ struct ggml_backend_metal_buffer {
620
+ void * data;
621
+ size_t size;
622
 
623
  id<MTLBuffer> metal;
624
  };
625
 
626
+ struct ggml_backend_metal_buffer_context {
627
+ void * all_data;
628
+ size_t all_size;
629
+ bool owned;
630
+
631
+ // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
632
+ int n_buffers;
633
+ struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
634
+ };
635
+
636
  // finds the Metal buffer that contains the tensor data on the GPU device
637
  // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
638
  // Metal buffer based on the host memory pointer
 
642
 
643
  const int64_t tsize = ggml_nbytes(t);
644
 
645
+ ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
646
+
647
  // compatibility with ggml-backend
648
+ if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
649
+ struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
650
+
651
+ // find the view that contains the tensor fully
652
+ for (int i = 0; i < buf_ctx->n_buffers; ++i) {
653
+ const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
654
 
655
+ //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
656
+ if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
657
+ *offs = (size_t) ioffs;
658
 
659
+ //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
660
+
661
+ return buf_ctx->buffers[i].metal;
662
+ }
663
+ }
664
 
665
+ GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
666
 
667
+ return nil;
668
  }
669
 
670
  // find the view that contains the tensor fully
 
1293
  {
1294
  GGML_ASSERT(ggml_is_contiguous(src0));
1295
 
1296
+ const float scale = *(const float *) dst->op_params;
1297
 
1298
  int64_t n = ggml_nelements(dst);
1299
 
 
1304
  [encoder setComputePipelineState:ctx->pipeline_scale];
1305
  }
1306
 
1307
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
1308
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
1309
  [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
1310
 
1311
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
 
2393
 
2394
  // backend interface
2395
 
2396
+ // default buffer
2397
  static id<MTLDevice> g_backend_device = nil;
2398
  static int g_backend_device_ref_count = 0;
2399
 
 
2421
  static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
2422
  struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
2423
 
2424
+ return ctx->all_data;
2425
  }
2426
 
2427
  static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
2428
  struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
2429
 
2430
+ for (int i = 0; i < ctx->n_buffers; i++) {
2431
+ [ctx->buffers[i].metal release];
2432
+ }
2433
  ggml_backend_metal_free_device();
2434
 
2435
+ if (ctx->owned) {
2436
+ free(ctx->all_data);
2437
+ }
2438
 
2439
+ free(ctx);
2440
  }
2441
 
2442
  static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2443
  memcpy((char *)tensor->data + offset, data, size);
2444
 
2445
  UNUSED(buffer);
2446
  }
2447
 
2448
  static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
2449
  memcpy(data, (const char *)tensor->data + offset, size);
2450
 
2451
  UNUSED(buffer);
 
2463
  UNUSED(buffer);
2464
  }
2465
 
2466
+ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
2467
+ struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
2468
+
2469
+ memset(ctx->all_data, value, ctx->all_size);
2470
+ }
2471
+
2472
+ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
2473
  /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
2474
  /* .get_base = */ ggml_backend_metal_buffer_get_base,
2475
  /* .init_tensor = */ NULL,
 
2477
  /* .get_tensor = */ ggml_backend_metal_buffer_get_tensor,
2478
  /* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from,
2479
  /* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to,
2480
+ /* .clear = */ ggml_backend_metal_buffer_clear,
2481
  };
2482
 
2483
+ // default buffer type
2484
+
2485
  static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
2486
  struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
2487
 
 
2492
  size_aligned += (size_page - (size_aligned % size_page));
2493
  }
2494
 
2495
+ id<MTLDevice> device = ggml_backend_metal_get_device();
2496
+
2497
+ ctx->all_data = ggml_metal_host_malloc(size_aligned);
2498
+ ctx->all_size = size_aligned;
2499
+ ctx->owned = true;
2500
+ ctx->n_buffers = 1;
2501
+
2502
+ ctx->buffers[0].data = ctx->all_data;
2503
+ ctx->buffers[0].size = size;
2504
+ ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
2505
  length:size_aligned
2506
  options:MTLResourceStorageModeShared
2507
  deallocator:nil];
2508
 
2509
+ if (ctx->buffers[0].metal == nil) {
2510
+ GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
2511
+ free(ctx);
2512
+ ggml_backend_metal_free_device();
2513
+ return NULL;
2514
+ }
2515
+
2516
+ GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
2517
+
2518
+
2519
+ #if TARGET_OS_OSX
2520
+ GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
2521
+ device.currentAllocatedSize / 1024.0 / 1024.0,
2522
+ device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
2523
+
2524
+ if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
2525
+ GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
2526
+ } else {
2527
+ GGML_METAL_LOG_INFO("\n");
2528
+ }
2529
+ #else
2530
+ GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
2531
+ #endif
2532
+
2533
+
2534
+ return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
2535
  }
2536
 
2537
  static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
 
2542
  static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
2543
  return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
2544
 
2545
+ UNUSED(buft);
2546
+ }
2547
+
2548
+ static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
2549
+ return true;
2550
+
2551
+ UNUSED(buft);
2552
  }
2553
 
2554
  ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
 
2558
  /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
2559
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
2560
  /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
2561
+ /* .is_host = */ ggml_backend_metal_buffer_type_is_host,
2562
  },
2563
  /* .context = */ NULL,
2564
  };
 
2566
  return &ggml_backend_buffer_type_metal;
2567
  }
2568
 
2569
+ // buffer from ptr
2570
+
2571
+ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
2572
+ struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
2573
+
2574
+ ctx->all_data = data;
2575
+ ctx->all_size = size;
2576
+ ctx->owned = false;
2577
+ ctx->n_buffers = 0;
2578
+
2579
+ const size_t size_page = sysconf(_SC_PAGESIZE);
2580
+ size_t size_aligned = size;
2581
+ if ((size_aligned % size_page) != 0) {
2582
+ size_aligned += (size_page - (size_aligned % size_page));
2583
+ }
2584
+
2585
+ id<MTLDevice> device = ggml_backend_metal_get_device();
2586
+
2587
+ // the buffer fits into the max buffer size allowed by the device
2588
+ if (size_aligned <= device.maxBufferLength) {
2589
+ ctx->buffers[ctx->n_buffers].data = data;
2590
+ ctx->buffers[ctx->n_buffers].size = size;
2591
+
2592
+ ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
2593
+
2594
+ if (ctx->buffers[ctx->n_buffers].metal == nil) {
2595
+ GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
2596
+ return false;
2597
+ }
2598
+
2599
+ GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
2600
+
2601
+ ++ctx->n_buffers;
2602
+ } else {
2603
+ // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
2604
+ // one of the views
2605
+ const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
2606
+ const size_t size_step = device.maxBufferLength - size_ovlp;
2607
+ const size_t size_view = device.maxBufferLength;
2608
+
2609
+ for (size_t i = 0; i < size; i += size_step) {
2610
+ const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
2611
+
2612
+ ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
2613
+ ctx->buffers[ctx->n_buffers].size = size_step_aligned;
2614
+
2615
+ ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
2616
+
2617
+ if (ctx->buffers[ctx->n_buffers].metal == nil) {
2618
+ GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
2619
+ return false;
2620
+ }
2621
+
2622
+ GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i);
2623
+ if (i + size_step < size) {
2624
+ GGML_METAL_LOG_INFO("\n");
2625
+ }
2626
+
2627
+ ++ctx->n_buffers;
2628
+ }
2629
+ }
2630
+
2631
+ #if TARGET_OS_OSX
2632
+ GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
2633
+ device.currentAllocatedSize / 1024.0 / 1024.0,
2634
+ device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
2635
+
2636
+ if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
2637
+ GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
2638
+ } else {
2639
+ GGML_METAL_LOG_INFO("\n");
2640
+ }
2641
+ #else
2642
+ GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
2643
+ #endif
2644
+
2645
+ return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
2646
+ }
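Note: the overlap logic above can be checked with a small worked example. With a hypothetical 3 GiB mapped region, a 512 MiB largest tensor, a 16 KiB page and a 2 GiB device maxBufferLength, size_ovlp rounds the largest tensor up past a page boundary plus one extra page, size_step = maxBufferLength - size_ovlp, and consecutive views overlap by size_ovlp bytes, so every tensor lands entirely inside at least one view. A standalone sketch of the same arithmetic (all numbers illustrative, not taken from the diff):

// sketch: how the overlapping Metal views cover a large mmap region
#include <stdio.h>
#include <stddef.h>

int main(void) {
    const size_t size_page  = 16384;                     // page size (illustrative)
    const size_t size       = 3ull*1024*1024*1024;       // total mapped region
    const size_t max_size   = 512ull*1024*1024;          // largest single tensor
    const size_t max_buffer = 2ull*1024*1024*1024;       // device.maxBufferLength (illustrative)

    const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round up, plus one page
    const size_t size_step = max_buffer - size_ovlp;
    const size_t size_view = max_buffer;

    for (size_t i = 0; i < size; i += size_step) {
        // the real code uses the page-aligned total size for the last view
        const size_t len = (i + size_view <= size) ? size_view : (size - i);
        printf("view at offset %zu, length %zu\n", i, len);
    }
    return 0;
}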
2647
+
2648
+ // backend
2649
+
2650
  static const char * ggml_backend_metal_name(ggml_backend_t backend) {
2651
  return "Metal";
2652
 
 
2659
  free(backend);
2660
  }
2661
 
2662
  static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
2663
  return ggml_backend_metal_buffer_type();
2664
 
 
2685
  /* .get_tensor_async = */ NULL,
2686
  /* .cpy_tensor_from_async = */ NULL,
2687
  /* .cpy_tensor_to_async = */ NULL,
2688
+ /* .synchronize = */ NULL,
2689
+ /* .graph_plan_create = */ NULL,
2690
  /* .graph_plan_free = */ NULL,
2691
  /* .graph_plan_compute = */ NULL,
2692
  /* .graph_compute = */ ggml_backend_metal_graph_compute,
2693
  /* .supports_op = */ ggml_backend_metal_supports_op,
2694
  };
2695
 
2696
  ggml_backend_t ggml_backend_metal_init(void) {
 
 
2697
  struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
2698
 
2699
  if (ctx == NULL) {
ggml-metal.metal CHANGED
@@ -1702,8 +1702,9 @@ kernel void kernel_rope(
1702
  dst_data[1] = x0*sin_theta + x1*cos_theta;
1703
  }
1704
  } else {
1705
- for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
1706
- for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
 
1707
 
1708
  // simplified from `(ib * n_dims + ic) * inv_ndims`
1709
  const float cur_rot = inv_ndims*ic - ib;
@@ -1722,6 +1723,14 @@ kernel void kernel_rope(
1722
 
1723
  dst_data[0] = x0*cos_theta - x1*sin_theta;
1724
  dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
1725
  }
1726
  }
1727
  }
 
1702
  dst_data[1] = x0*sin_theta + x1*cos_theta;
1703
  }
1704
  } else {
1705
+ for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) {
1706
+ if (ic < n_dims) {
1707
+ const int64_t ib = 0;
1708
 
1709
  // simplified from `(ib * n_dims + ic) * inv_ndims`
1710
  const float cur_rot = inv_ndims*ic - ib;
 
1723
 
1724
  dst_data[0] = x0*cos_theta - x1*sin_theta;
1725
  dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
1726
+ } else {
1727
+ const int64_t i0 = ic;
1728
+
1729
+ device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
1730
+ device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
1731
+
1732
+ dst_data[0] = src[0];
1733
+ dst_data[1] = src[1];
1734
  }
1735
  }
1736
  }
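Note: the restructured kernel loop above walks the full row (ne0) instead of only the rotated n_dims block; positions below n_dims get the usual rotation and the tail of the row is copied through unchanged. The CPU rope in ggml.c gets the same restructuring further down. A simplified scalar sketch of the control flow (plain C, illustrative; the real kernel derives theta per element and handles strides):

// sketch: rope over a row of ne0 elements where only the first n_dims are rotated
static void rope_row(float * dst, const float * src, long ne0, long n_dims,
                     float cos_theta, float sin_theta) {
    for (long ic = 0; ic < ne0; ic += 2) {
        if (ic < n_dims) {
            // rotated pair (neox layout): elements ic/2 and ic/2 + n_dims/2
            const float x0 = src[ic/2];
            const float x1 = src[ic/2 + n_dims/2];
            dst[ic/2]            = x0*cos_theta - x1*sin_theta;
            dst[ic/2 + n_dims/2] = x0*sin_theta + x1*cos_theta;
        } else {
            // pass-through for the non-rotated tail of the row
            dst[ic + 0] = src[ic + 0];
            dst[ic + 1] = src[ic + 1];
        }
    }
}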
ggml-quants.c CHANGED
@@ -3677,7 +3677,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
3677
 
3678
  const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4);
3679
  const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
3680
- const ggml_int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))};
3681
  const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])),
3682
  vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0])));
3683
  const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])),
@@ -6626,7 +6626,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
6626
 
6627
  const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
6628
  const int8x16_t scales = vld1q_s8(scale);
6629
- const ggml_int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))};
6630
 
6631
  const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
6632
  vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
 
3677
 
3678
  const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4);
3679
  const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
3680
+ const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}};
3681
  const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])),
3682
  vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0])));
3683
  const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])),
 
6626
 
6627
  const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
6628
  const int8x16_t scales = vld1q_s8(scale);
6629
+ const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}};
6630
 
6631
  const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
6632
  vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
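Note: the two one-line changes in this file silence the missing-braces warning on ARM: ggml_int16x8x2_t is a struct whose only member is a two-element array of vectors, so a brace-enclosed initializer needs one set of braces for the struct and one for the array. A standalone illustration of the same pattern with generic types (not NEON):

// sketch: why the extra braces are needed when the struct wraps an array
#include <stdint.h>

typedef struct {
    int16_t val[2];   // stands in for the two int16x8_t vectors of ggml_int16x8x2_t
} pair_t;

int main(void) {
    // pair_t p = { 1, 2 };    // valid, but warns with -Wmissing-braces
    pair_t p = {{ 1, 2 }};     // outer braces for the struct, inner for the array
    return p.val[0] + p.val[1] - 3;
}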
ggml.c CHANGED
@@ -1997,12 +1997,6 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
1997
  return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
1998
  }
1999
 
2000
- size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
2001
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2002
-
2003
- return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
2004
- }
2005
-
2006
  int ggml_blck_size(enum ggml_type type) {
2007
  return type_traits[type].blck_size;
2008
  }
@@ -2011,8 +2005,13 @@ size_t ggml_type_size(enum ggml_type type) {
2011
  return type_traits[type].type_size;
2012
  }
2013
 
2014
- float ggml_type_sizef(enum ggml_type type) {
2015
- return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
 
 
 
 
 
2016
  }
2017
 
2018
  const char * ggml_type_name(enum ggml_type type) {
@@ -2049,24 +2048,37 @@ size_t ggml_element_size(const struct ggml_tensor * tensor) {
2049
  return ggml_type_size(tensor->type);
2050
  }
2051
 
2052
- static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
2053
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2054
 
2055
  return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
2056
  }
2057
 
2058
- static inline bool ggml_is_vector(const struct ggml_tensor * tensor) {
2059
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2060
 
2061
  return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
2062
  }
2063
 
2064
- static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
2065
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2066
 
2067
  return tensor->ne[2] == 1 && tensor->ne[3] == 1;
2068
  }
2069
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2070
  static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
2071
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2072
 
@@ -2371,20 +2383,8 @@ size_t ggml_get_mem_size(const struct ggml_context * ctx) {
2371
  size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
2372
  size_t max_size = 0;
2373
 
2374
- struct ggml_object * obj = ctx->objects_begin;
2375
-
2376
- while (obj != NULL) {
2377
- if (obj->type == GGML_OBJECT_TENSOR) {
2378
- struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
2379
-
2380
- const size_t size = ggml_nbytes(tensor);
2381
-
2382
- if (max_size < size) {
2383
- max_size = size;
2384
- }
2385
- }
2386
-
2387
- obj = obj->next;
2388
  }
2389
 
2390
  return max_size;
@@ -2473,7 +2473,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
2473
  view_src = view_src->view_src;
2474
  }
2475
 
2476
- size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
2477
  for (int i = 1; i < n_dims; i++) {
2478
  data_size *= ne[i];
2479
  }
@@ -2516,7 +2516,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
2516
  /*.type =*/ type,
2517
  /*.backend =*/ GGML_BACKEND_CPU,
2518
  /*.buffer =*/ NULL,
2519
- /*.n_dims =*/ n_dims,
2520
  /*.ne =*/ { 1, 1, 1, 1 },
2521
  /*.nb =*/ { 0, 0, 0, 0 },
2522
  /*.op =*/ GGML_OP_NONE,
@@ -2623,7 +2622,7 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
2623
  }
2624
 
2625
  struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
2626
- return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
2627
  }
2628
 
2629
  static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
@@ -3072,7 +3071,7 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
3072
  struct ggml_tensor * ggml_view_tensor(
3073
  struct ggml_context * ctx,
3074
  struct ggml_tensor * src) {
3075
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
3076
  ggml_format_name(result, "%s (view)", src->name);
3077
 
3078
  for (int i = 0; i < GGML_MAX_DIMS; i++) {
@@ -3082,7 +3081,7 @@ struct ggml_tensor * ggml_view_tensor(
3082
  return result;
3083
  }
3084
 
3085
- struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
3086
  struct ggml_object * obj = ctx->objects_begin;
3087
 
3088
  char * const mem_buffer = ctx->mem_buffer;
@@ -3098,7 +3097,7 @@ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
3098
  return NULL;
3099
  }
3100
 
3101
- struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
3102
  struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
3103
  obj = obj->next;
3104
 
@@ -3230,10 +3229,10 @@ static struct ggml_tensor * ggml_add_cast_impl(
3230
  is_node = true;
3231
  }
3232
 
3233
- struct ggml_tensor * result = ggml_new_tensor(ctx, type, a->n_dims, a->ne);
3234
 
3235
  result->op = GGML_OP_ADD;
3236
- result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne) : NULL;
3237
  result->src[0] = a;
3238
  result->src[1] = b;
3239
 
@@ -3602,12 +3601,12 @@ struct ggml_tensor * ggml_sum_rows(
3602
  is_node = true;
3603
  }
3604
 
3605
- int64_t ne[4] = {1,1,1,1};
3606
- for (int i=1; i<a->n_dims; ++i) {
3607
  ne[i] = a->ne[i];
3608
  }
3609
 
3610
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, a->n_dims, ne);
3611
 
3612
  result->op = GGML_OP_SUM_ROWS;
3613
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3628,8 +3627,8 @@ struct ggml_tensor * ggml_mean(
3628
  is_node = true;
3629
  }
3630
 
3631
- int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] };
3632
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne);
3633
 
3634
  result->op = GGML_OP_MEAN;
3635
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3651,8 +3650,7 @@ struct ggml_tensor * ggml_argmax(
3651
  is_node = true;
3652
  }
3653
 
3654
- int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 };
3655
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne);
3656
 
3657
  result->op = GGML_OP_ARGMAX;
3658
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3675,7 +3673,7 @@ struct ggml_tensor * ggml_repeat(
3675
  is_node = true;
3676
  }
3677
 
3678
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
3679
 
3680
  result->op = GGML_OP_REPEAT;
3681
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3702,7 +3700,7 @@ struct ggml_tensor * ggml_repeat_back(
3702
  return a;
3703
  }
3704
 
3705
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
3706
 
3707
  result->op = GGML_OP_REPEAT_BACK;
3708
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4078,7 +4076,7 @@ struct ggml_tensor * ggml_mul_mat(
4078
  }
4079
 
4080
  const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
4081
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);
4082
 
4083
  result->op = GGML_OP_MUL_MAT;
4084
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4088,6 +4086,14 @@ struct ggml_tensor * ggml_mul_mat(
4088
  return result;
4089
  }
4090
 
4091
  // ggml_mul_mat_id
4092
 
4093
  struct ggml_tensor * ggml_mul_mat_id(
@@ -4112,7 +4118,7 @@ struct ggml_tensor * ggml_mul_mat_id(
4112
  }
4113
 
4114
  const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
4115
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
4116
 
4117
  ggml_set_op_params_i32(result, 0, id);
4118
  ggml_set_op_params_i32(result, 1, n_as);
@@ -4150,7 +4156,7 @@ struct ggml_tensor * ggml_out_prod(
4150
 
4151
  // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
4152
  const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
4153
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);
4154
 
4155
  result->op = GGML_OP_OUT_PROD;
4156
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4165,23 +4171,23 @@ struct ggml_tensor * ggml_out_prod(
4165
  static struct ggml_tensor * ggml_scale_impl(
4166
  struct ggml_context * ctx,
4167
  struct ggml_tensor * a,
4168
- struct ggml_tensor * b,
4169
  bool inplace) {
4170
- GGML_ASSERT(ggml_is_scalar(b));
4171
  GGML_ASSERT(ggml_is_padded_1d(a));
4172
 
4173
  bool is_node = false;
4174
 
4175
- if (a->grad || b->grad) {
4176
  is_node = true;
4177
  }
4178
 
4179
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
4180
 
 
 
4181
  result->op = GGML_OP_SCALE;
4182
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
4183
  result->src[0] = a;
4184
- result->src[1] = b;
4185
 
4186
  return result;
4187
  }
@@ -4189,15 +4195,15 @@ static struct ggml_tensor * ggml_scale_impl(
4189
  struct ggml_tensor * ggml_scale(
4190
  struct ggml_context * ctx,
4191
  struct ggml_tensor * a,
4192
- struct ggml_tensor * b) {
4193
- return ggml_scale_impl(ctx, a, b, false);
4194
  }
4195
 
4196
  struct ggml_tensor * ggml_scale_inplace(
4197
  struct ggml_context * ctx,
4198
  struct ggml_tensor * a,
4199
- struct ggml_tensor * b) {
4200
- return ggml_scale_impl(ctx, a, b, true);
4201
  }
4202
 
4203
  // ggml_set
@@ -4435,7 +4441,7 @@ struct ggml_tensor * ggml_reshape(
4435
  //GGML_ASSERT(false);
4436
  }
4437
 
4438
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
4439
  ggml_format_name(result, "%s (reshaped)", a->name);
4440
 
4441
  result->op = GGML_OP_RESHAPE;
@@ -4813,7 +4819,7 @@ struct ggml_tensor * ggml_diag(
4813
  }
4814
 
4815
  const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
4816
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne);
4817
 
4818
  result->op = GGML_OP_DIAG;
4819
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5460,7 +5466,7 @@ struct ggml_tensor * ggml_pool_1d(
5460
  is_node = true;
5461
  }
5462
 
5463
- const int64_t ne[3] = {
5464
  ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
5465
  a->ne[1],
5466
  };
@@ -5579,7 +5585,7 @@ struct ggml_tensor * ggml_argsort(
5579
  enum ggml_sort_order order) {
5580
  bool is_node = false;
5581
 
5582
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, a->ne);
5583
 
5584
  ggml_set_op_params_i32(result, 0, (int32_t) order);
5585
 
@@ -5626,7 +5632,7 @@ struct ggml_tensor * ggml_flash_attn(
5626
  }
5627
 
5628
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
5629
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne);
5630
 
5631
  int32_t t = masked ? 1 : 0;
5632
  ggml_set_op_params(result, &t, sizeof(t));
@@ -5659,7 +5665,7 @@ struct ggml_tensor * ggml_flash_ff(
5659
  }
5660
 
5661
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
5662
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne);
5663
 
5664
  result->op = GGML_OP_FLASH_FF;
5665
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5775,7 +5781,6 @@ struct ggml_tensor * ggml_win_part(
5775
  const int np = npx*npy;
5776
 
5777
  const int64_t ne[4] = { a->ne[0], w, w, np, };
5778
-
5779
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5780
 
5781
  int32_t params[] = { npx, npy, w };
@@ -7759,10 +7764,10 @@ static void ggml_compute_forward_mul_f32(
7759
  const int ith = params->ith;
7760
  const int nth = params->nth;
7761
 
7762
- // TODO: OpenCL kernel support broadcast
7763
  #ifdef GGML_USE_CLBLAST
7764
  if (src1->backend == GGML_BACKEND_GPU) {
7765
- GGML_ASSERT(ggml_are_same_shape(src0, src1));
 
7766
  if (ith == 0) {
7767
  ggml_cl_mul(src0, src1, dst);
7768
  }
@@ -9159,6 +9164,8 @@ static void ggml_compute_forward_norm_f32(
9159
  float eps;
9160
  memcpy(&eps, dst->op_params, sizeof(float));
9161
 
 
 
9162
  // TODO: optimize
9163
  for (int64_t i03 = 0; i03 < ne03; i03++) {
9164
  for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -9228,6 +9235,8 @@ static void ggml_compute_forward_rms_norm_f32(
9228
  float eps;
9229
  memcpy(&eps, dst->op_params, sizeof(float));
9230
 
 
 
9231
  // TODO: optimize
9232
  for (int64_t i03 = 0; i03 < ne03; i03++) {
9233
  for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -9571,16 +9580,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
9571
  }
9572
  #endif
9573
 
9574
- // off1 = offset in i11 and i1
9575
- // cne1 = ne11 and ne1
9576
- // in a normal matrix multiplication, off1 = 0 and cne1 = ne1
9577
- // during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
9578
  static void ggml_compute_forward_mul_mat(
9579
  const struct ggml_compute_params * params,
9580
  const struct ggml_tensor * src0,
9581
  const struct ggml_tensor * src1,
9582
- struct ggml_tensor * dst,
9583
- int64_t off1, int64_t cne1) {
9584
  int64_t t0 = ggml_perf_time_us();
9585
  UNUSED(t0);
9586
 
@@ -9648,9 +9652,9 @@ static void ggml_compute_forward_mul_mat(
9648
  const int64_t i03 = i13/r3;
9649
  const int64_t i02 = i12/r2;
9650
 
9651
- const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
9652
- const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
9653
- float * d = (float *) ((char *) dst->data + off1*nb1 + i12*nb2 + i13*nb3);
9654
 
9655
  if (type != GGML_TYPE_F32) {
9656
  float * const wdata = params->wdata;
@@ -9667,7 +9671,7 @@ static void ggml_compute_forward_mul_mat(
9667
  }
9668
 
9669
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
9670
- cne1, ne01, ne10,
9671
  1.0f, y, ne10,
9672
  x, ne00,
9673
  0.0f, d, ne01);
@@ -9683,7 +9687,7 @@ static void ggml_compute_forward_mul_mat(
9683
  if (params->type == GGML_TASK_INIT) {
9684
  if (src1->type != vec_dot_type) {
9685
  char * wdata = params->wdata;
9686
- const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
9687
 
9688
  assert(params->wsize >= ne11*ne12*ne13*row_size);
9689
  assert(src1->type == GGML_TYPE_F32);
@@ -9706,10 +9710,10 @@ static void ggml_compute_forward_mul_mat(
9706
  }
9707
 
9708
  const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
9709
- const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
9710
 
9711
- const int64_t nr0 = ne01; // src0 rows
9712
- const int64_t nr1 = cne1*ne12*ne13; // src1 rows
9713
 
9714
  //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
9715
 
@@ -9751,9 +9755,9 @@ static void ggml_compute_forward_mul_mat(
9751
  for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
9752
  for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
9753
  for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
9754
- const int64_t i13 = (ir1/(ne12*cne1));
9755
- const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
9756
- const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;
9757
 
9758
  // broadcast src0 into src1
9759
  const int64_t i03 = i13/r3;
@@ -9793,28 +9797,191 @@ static void ggml_compute_forward_mul_mat(
9793
 
9794
  static void ggml_compute_forward_mul_mat_id(
9795
  const struct ggml_compute_params * params,
9796
- const struct ggml_tensor * src0,
9797
  const struct ggml_tensor * src1,
9798
  struct ggml_tensor * dst) {
9799
 
9800
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9801
- // during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
9802
- ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
9803
- return;
9804
- }
9805
 
9806
- const struct ggml_tensor * ids = src0;
9807
  const int id = ggml_get_op_params_i32(dst, 0);
9808
  const int n_as = ggml_get_op_params_i32(dst, 1);
9809
 
9810
- for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
9811
- const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
9812
 
9813
- GGML_ASSERT(row_id >= 0 && row_id < n_as);
9814
 
9815
- const struct ggml_tensor * src0_row = dst->src[row_id + 2];
9816
- ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
9817
  }
9818
  }
9819
 
9820
  // ggml_compute_forward_out_prod
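Note: the removed code above dispatched one ggml_compute_forward_mul_mat call per token, using off1/cne1 to select a single src1 row. The replacement (in the new version of this file, further down) first buckets the src1 rows by the expert selected in the ids tensor and then runs one matrix multiplication per expert over its bucket. A minimal sketch of that grouping step, with illustrative names and a flat rows array:

// sketch: group src1 row indices by expert id before doing one matmul per expert
#include <string.h>
#include <stdint.h>

// ids[i]        : expert chosen for src1 row i (0 <= ids[i] < n_as)
// rows[e*n_rows + k] : k-th src1 row assigned to expert e
// row_counts[e] : number of rows assigned to expert e
static void group_rows_by_expert(const int32_t * ids, int64_t n_rows, int n_as,
                                 int64_t * rows       /* [n_as * n_rows] */,
                                 int64_t * row_counts /* [n_as] */) {
    memset(row_counts, 0, n_as*sizeof(int64_t));
    for (int64_t i = 0; i < n_rows; ++i) {
        const int32_t e = ids[i];
        rows[e*n_rows + row_counts[e]] = i;
        row_counts[e] += 1;
    }
    // afterwards: for each expert e with row_counts[e] > 0, multiply its src0
    // against the gathered rows, instead of issuing one call per token
}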
@@ -10158,19 +10325,18 @@ static void ggml_compute_forward_out_prod(
10158
  static void ggml_compute_forward_scale_f32(
10159
  const struct ggml_compute_params * params,
10160
  const struct ggml_tensor * src0,
10161
- const struct ggml_tensor * src1,
10162
  struct ggml_tensor * dst) {
10163
  GGML_ASSERT(ggml_is_contiguous(src0));
10164
  GGML_ASSERT(ggml_is_contiguous(dst));
10165
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10166
- GGML_ASSERT(ggml_is_scalar(src1));
10167
 
10168
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10169
  return;
10170
  }
10171
 
10172
  // scale factor
10173
- const float v = *(float *) src1->data;
 
10174
 
10175
  const int ith = params->ith;
10176
  const int nth = params->nth;
@@ -10201,12 +10367,11 @@ static void ggml_compute_forward_scale_f32(
10201
  static void ggml_compute_forward_scale(
10202
  const struct ggml_compute_params * params,
10203
  const struct ggml_tensor * src0,
10204
- const struct ggml_tensor * src1,
10205
  struct ggml_tensor * dst) {
10206
  switch (src0->type) {
10207
  case GGML_TYPE_F32:
10208
  {
10209
- ggml_compute_forward_scale_f32(params, src0, src1, dst);
10210
  } break;
10211
  default:
10212
  {
@@ -11395,10 +11560,13 @@ static void ggml_compute_forward_rope_f32(
11395
  }
11396
  } else {
11397
  // TODO: this might be wrong for ne0 != n_dims - need double check
11398
- // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
 
11399
  theta_base *= freq_scale;
11400
- for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
11401
- for (int64_t ic = 0; ic < n_dims; ic += 2) {
 
 
11402
  // simplified from `(ib * n_dims + ic) * inv_ndims`
11403
  float cur_rot = inv_ndims * ic - ib;
11404
 
@@ -11421,6 +11589,14 @@ static void ggml_compute_forward_rope_f32(
11421
 
11422
  dst_data[0] = x0*cos_theta - x1*sin_theta;
11423
  dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
11424
  }
11425
  }
11426
  }
@@ -11548,10 +11724,13 @@ static void ggml_compute_forward_rope_f16(
11548
  }
11549
  } else {
11550
  // TODO: this might be wrong for ne0 != n_dims - need double check
11551
- // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
 
11552
  theta_base *= freq_scale;
11553
- for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
11554
- for (int64_t ic = 0; ic < n_dims; ic += 2) {
 
 
11555
  // simplified from `(ib * n_dims + ic) * inv_ndims`
11556
  float cur_rot = inv_ndims * ic - ib;
11557
 
@@ -11574,6 +11753,14 @@ static void ggml_compute_forward_rope_f16(
11574
 
11575
  dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
11576
  dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
11577
  }
11578
  }
11579
  }
@@ -14182,7 +14369,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14182
  } break;
14183
  case GGML_OP_MUL_MAT:
14184
  {
14185
- ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
14186
  } break;
14187
  case GGML_OP_MUL_MAT_ID:
14188
  {
@@ -14194,7 +14381,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14194
  } break;
14195
  case GGML_OP_SCALE:
14196
  {
14197
- ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor);
14198
  } break;
14199
  case GGML_OP_SET:
14200
  {
@@ -14558,7 +14745,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
14558
  return replacements->vals[i];
14559
  }
14560
 
14561
- struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne);
14562
 
14563
  // insert clone into replacements
14564
  GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
@@ -14650,7 +14837,7 @@ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct gg
14650
 
14651
  static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
14652
  if (ggml_hash_contains(zero_table, a)) {
14653
- struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0));
14654
  return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
14655
  } else {
14656
  return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
@@ -14786,7 +14973,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14786
  src0->grad,
14787
  ggml_scale(ctx,
14788
  ggml_mul(ctx, src0, tensor->grad),
14789
- ggml_new_f32(ctx, 2.0f)),
14790
  zero_table);
14791
  }
14792
  } break;
@@ -14800,7 +14987,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14800
  ggml_div(ctx,
14801
  tensor->grad,
14802
  tensor),
14803
- ggml_new_f32(ctx, 0.5f)),
14804
  zero_table);
14805
  }
14806
  } break;
@@ -14966,17 +15153,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14966
  {
14967
  // necessary for llama
14968
  if (src0->grad) {
14969
  src0->grad =
14970
  ggml_add_or_set(ctx,
14971
  src0->grad,
14972
- ggml_scale_impl(ctx, tensor->grad, src1, false),
14973
- zero_table);
14974
- }
14975
- if (src1->grad) {
14976
- src1->grad =
14977
- ggml_add_or_set(ctx,
14978
- src1->grad,
14979
- ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)),
14980
  zero_table);
14981
  }
14982
  } break;
@@ -15154,6 +15337,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15154
  const int n_past = ((int32_t *) tensor->op_params)[0];
15155
  src0->grad =
15156
  ggml_add_or_set(ctx, src0->grad,
 
 
15157
  ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
15158
  zero_table);
15159
  }
@@ -15982,7 +16167,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15982
  } break;
15983
  case GGML_OP_MUL_MAT_ID:
15984
  {
15985
- // FIXME: blas
15986
  n_tasks = n_threads;
15987
  } break;
15988
  case GGML_OP_OUT_PROD:
@@ -16311,25 +16495,21 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16311
  } else
16312
  #endif
16313
  if (node->src[1]->type != vec_dot_type) {
16314
- cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
16315
  }
16316
  } break;
16317
  case GGML_OP_MUL_MAT_ID:
16318
  {
16319
- const struct ggml_tensor * a = node->src[2];
16320
- const struct ggml_tensor * b = node->src[1];
16321
- const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
16322
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16323
- if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
16324
- if (a->type != GGML_TYPE_F32) {
16325
- // here we need memory just for single 2D matrix from src0
16326
- cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
16327
- }
16328
- } else
16329
- #endif
16330
- if (b->type != vec_dot_type) {
16331
- cur = ggml_type_size(vec_dot_type)*ggml_nelements(b)/ggml_blck_size(vec_dot_type);
16332
  }
16333
  } break;
16334
  case GGML_OP_OUT_PROD:
16335
  {
@@ -16559,7 +16739,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
16559
  fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
16560
  ggml_type_name(tensor->type),
16561
  ggml_op_name (tensor->op),
16562
- tensor->n_dims,
16563
  ne[0], ne[1], ne[2], ne[3],
16564
  nb[0], nb[1], nb[2], nb[3],
16565
  tensor->data,
@@ -16574,7 +16754,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
16574
  arg,
16575
  ggml_type_name(tensor->type),
16576
  ggml_op_name (tensor->op),
16577
- tensor->n_dims,
16578
  ne[0], ne[1], ne[2], ne[3],
16579
  nb[0], nb[1], nb[2], nb[3],
16580
  tensor->data,
@@ -16664,11 +16844,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16664
 
16665
  const uint32_t type = tensor->type;
16666
  const uint32_t op = tensor->op;
16667
- const uint32_t n_dims = tensor->n_dims;
16668
 
16669
  fwrite(&type, sizeof(uint32_t), 1, fout);
16670
  fwrite(&op, sizeof(uint32_t), 1, fout);
16671
- fwrite(&n_dims, sizeof(uint32_t), 1, fout);
16672
 
16673
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
16674
  const uint64_t ne = tensor->ne[j];
@@ -16698,11 +16876,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16698
 
16699
  const uint32_t type = tensor->type;
16700
  const uint32_t op = tensor->op;
16701
- const uint32_t n_dims = tensor->n_dims;
16702
 
16703
  fwrite(&type, sizeof(uint32_t), 1, fout);
16704
  fwrite(&op, sizeof(uint32_t), 1, fout);
16705
- fwrite(&n_dims, sizeof(uint32_t), 1, fout);
16706
 
16707
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
16708
  const uint64_t ne = tensor->ne[j];
@@ -16874,12 +17050,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16874
  {
16875
  uint32_t type;
16876
  uint32_t op;
16877
- uint32_t n_dims;
16878
 
16879
  for (uint32_t i = 0; i < n_leafs; ++i) {
16880
  type = *(const uint32_t *) ptr; ptr += sizeof(type);
16881
  op = *(const uint32_t *) ptr; ptr += sizeof(op);
16882
- n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
16883
 
16884
  int64_t ne[GGML_MAX_DIMS];
16885
  size_t nb[GGML_MAX_DIMS];
@@ -16895,7 +17069,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16895
  nb[j] = nb_cur;
16896
  }
16897
 
16898
- struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
16899
 
16900
  tensor->op = (enum ggml_op) op;
16901
 
@@ -16912,7 +17086,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16912
 
16913
  ptr += ggml_nbytes(tensor);
16914
 
16915
- fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
16916
  }
16917
  }
16918
 
@@ -16922,12 +17096,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16922
  {
16923
  uint32_t type;
16924
  uint32_t op;
16925
- uint32_t n_dims;
16926
 
16927
  for (uint32_t i = 0; i < n_nodes; ++i) {
16928
  type = *(const uint32_t *) ptr; ptr += sizeof(type);
16929
  op = *(const uint32_t *) ptr; ptr += sizeof(op);
16930
- n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
16931
 
16932
  enum ggml_op eop = (enum ggml_op) op;
16933
 
@@ -16998,7 +17170,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16998
  } break;
16999
  default:
17000
  {
17001
- tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
17002
 
17003
  tensor->op = eop;
17004
  } break;
@@ -17017,7 +17189,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
17017
 
17018
  result->nodes[i] = tensor;
17019
 
17020
- fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
17021
  }
17022
  }
17023
  }
@@ -17155,7 +17327,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17155
  fprintf(fp, "(%s)|", ggml_type_name(node->type));
17156
  }
17157
 
17158
- if (node->n_dims == 2) {
17159
  fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
17160
  } else {
17161
  fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
@@ -17422,7 +17594,7 @@ static enum ggml_opt_result ggml_opt_adam(
17422
  int64_t i = 0;
17423
  for (int p = 0; p < np; ++p) {
17424
  const int64_t ne = ggml_nelements(ps[p]);
17425
- const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
17426
  for (int64_t j = 0; j < ne; ++j) {
17427
  float x = ggml_get_f32_1d(ps[p], j);
17428
  float g_ = g[i]*gnorm;
@@ -18696,7 +18868,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18696
  return NULL;
18697
  }
18698
 
18699
- const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
18700
 
18701
  ctx->size += GGML_PAD(size_cur, ctx->alignment);
18702
  }
@@ -19025,6 +19197,10 @@ char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
19025
  return ctx->infos[i].name.data;
19026
  }
19027
 
19028
  // returns the index
19029
  static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
19030
  const int idx = gguf_find_key(ctx, key);
@@ -19200,8 +19376,8 @@ void gguf_add_tensor(
19200
  ctx->infos[idx].ne[i] = 1;
19201
  }
19202
 
19203
- ctx->infos[idx].n_dims = tensor->n_dims;
19204
- for (int i = 0; i < tensor->n_dims; i++) {
19205
  ctx->infos[idx].ne[i] = tensor->ne[i];
19206
  }
19207
 
 
1997
  return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
1998
  }
1999
 
2000
  int ggml_blck_size(enum ggml_type type) {
2001
  return type_traits[type].blck_size;
2002
  }
 
2005
  return type_traits[type].type_size;
2006
  }
2007
 
2008
+ size_t ggml_row_size(enum ggml_type type, int64_t ne) {
2009
+ assert(ne % ggml_blck_size(type) == 0);
2010
+ return ggml_type_size(type)*ne/ggml_blck_size(type);
2011
+ }
2012
+
2013
+ double ggml_type_sizef(enum ggml_type type) {
2014
+ return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
2015
  }
2016
 
2017
  const char * ggml_type_name(enum ggml_type type) {
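Note: ggml_row_size replaces the old float-returning ggml_type_sizef in size computations. It returns the exact byte size of ne elements of a possibly block-quantized type and asserts that ne is a multiple of the block size; for reference, Q4_0 packs 32 values into an 18-byte block, so a 4096-wide row is 4096/32*18 = 2304 bytes. A small usage sketch against this revision of ggml.h:

// sketch: exact row sizes with the new helper
#include <stdio.h>
#include "ggml.h"

int main(void) {
    const size_t f32_row  = ggml_row_size(GGML_TYPE_F32,  4096); // 4096*4     = 16384 bytes
    const size_t q4_0_row = ggml_row_size(GGML_TYPE_Q4_0, 4096); // 4096/32*18 =  2304 bytes

    printf("f32: %zu, q4_0: %zu\n", f32_row, q4_0_row);
    return 0;
}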
 
2048
  return ggml_type_size(tensor->type);
2049
  }
2050
 
2051
+ bool ggml_is_scalar(const struct ggml_tensor * tensor) {
2052
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2053
 
2054
  return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
2055
  }
2056
 
2057
+ bool ggml_is_vector(const struct ggml_tensor * tensor) {
2058
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2059
 
2060
  return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
2061
  }
2062
 
2063
+ bool ggml_is_matrix(const struct ggml_tensor * tensor) {
2064
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2065
 
2066
  return tensor->ne[2] == 1 && tensor->ne[3] == 1;
2067
  }
2068
 
2069
+ bool ggml_is_3d(const struct ggml_tensor * tensor) {
2070
+ return tensor->ne[3] == 1;
2071
+ }
2072
+
2073
+ int ggml_n_dims(const struct ggml_tensor * tensor) {
2074
+ for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
2075
+ if (tensor->ne[i] > 1) {
2076
+ return i + 1;
2077
+ }
2078
+ }
2079
+ return 1;
2080
+ }
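Note: with the fixed n_dims field gone from ggml_tensor, the rank is now derived from the shape on demand: ggml_n_dims returns one past the last extent greater than 1, and never less than 1. So a tensor with ne = {4096, 32, 1, 1} reports 2, and a 1-element tensor reports 1. A short check against this revision (assumes a valid ggml_context with enough memory):

// sketch: ggml_n_dims derives the rank from the shape instead of a stored field
#include <assert.h>
#include "ggml.h"

static void check_n_dims(struct ggml_context * ctx) {
    struct ggml_tensor * m = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 32);
    struct ggml_tensor * s = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);

    assert(ggml_n_dims(m) == 2); // ne = {4096, 32, 1, 1}
    assert(ggml_n_dims(s) == 1); // scalars and vectors both report 1
}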
2081
+
2082
  static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
2083
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2084
 
 
2383
  size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
2384
  size_t max_size = 0;
2385
 
2386
+ for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
2387
+ max_size = MAX(max_size, ggml_nbytes(tensor));
2388
  }
2389
 
2390
  return max_size;
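Note: the rewritten ggml_get_max_tensor_size above doubles as a template for user code: ggml_get_first_tensor / ggml_get_next_tensor (now taking a const context) are the supported way to walk every tensor in a context. A small sketch that counts tensors and bytes the same way (helper name is illustrative):

// sketch: walking all tensors in a context with the iterator helpers
#include <stdio.h>
#include "ggml.h"

static void dump_context(const struct ggml_context * ctx) {
    size_t n_tensors = 0;
    size_t n_bytes   = 0;

    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        n_tensors += 1;
        n_bytes   += ggml_nbytes(t);
    }

    printf("%zu tensors, %zu bytes\n", n_tensors, n_bytes);
}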
 
2473
  view_src = view_src->view_src;
2474
  }
2475
 
2476
+ size_t data_size = ggml_row_size(type, ne[0]);
2477
  for (int i = 1; i < n_dims; i++) {
2478
  data_size *= ne[i];
2479
  }
 
2516
  /*.type =*/ type,
2517
  /*.backend =*/ GGML_BACKEND_CPU,
2518
  /*.buffer =*/ NULL,
 
2519
  /*.ne =*/ { 1, 1, 1, 1 },
2520
  /*.nb =*/ { 0, 0, 0, 0 },
2521
  /*.op =*/ GGML_OP_NONE,
 
2622
  }
2623
 
2624
  struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
2625
+ return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
2626
  }
2627
 
2628
  static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
 
3071
  struct ggml_tensor * ggml_view_tensor(
3072
  struct ggml_context * ctx,
3073
  struct ggml_tensor * src) {
3074
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0);
3075
  ggml_format_name(result, "%s (view)", src->name);
3076
 
3077
  for (int i = 0; i < GGML_MAX_DIMS; i++) {
 
3081
  return result;
3082
  }
3083
 
3084
+ struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
3085
  struct ggml_object * obj = ctx->objects_begin;
3086
 
3087
  char * const mem_buffer = ctx->mem_buffer;
 
3097
  return NULL;
3098
  }
3099
 
3100
+ struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
3101
  struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
3102
  obj = obj->next;
3103
 
 
3229
  is_node = true;
3230
  }
3231
 
3232
+ struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
3233
 
3234
  result->op = GGML_OP_ADD;
3235
+ result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL;
3236
  result->src[0] = a;
3237
  result->src[1] = b;
3238
 
 
3601
  is_node = true;
3602
  }
3603
 
3604
+ int64_t ne[GGML_MAX_DIMS] = { 1 };
3605
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
3606
  ne[i] = a->ne[i];
3607
  }
3608
 
3609
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
3610
 
3611
  result->op = GGML_OP_SUM_ROWS;
3612
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
3627
  is_node = true;
3628
  }
3629
 
3630
+ int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
3631
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
3632
 
3633
  result->op = GGML_OP_MEAN;
3634
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
3650
  is_node = true;
3651
  }
3652
 
3653
+ struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
 
3654
 
3655
  result->op = GGML_OP_ARGMAX;
3656
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
3673
  is_node = true;
3674
  }
3675
 
3676
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
3677
 
3678
  result->op = GGML_OP_REPEAT;
3679
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
3700
  return a;
3701
  }
3702
 
3703
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
3704
 
3705
  result->op = GGML_OP_REPEAT_BACK;
3706
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
4076
  }
4077
 
4078
  const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
4079
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
4080
 
4081
  result->op = GGML_OP_MUL_MAT;
4082
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
4086
  return result;
4087
  }
4088
 
4089
+ void ggml_mul_mat_set_prec(
4090
+ struct ggml_tensor * a,
4091
+ enum ggml_prec prec) {
4092
+ const int32_t prec_i32 = (int32_t) prec;
4093
+
4094
+ ggml_set_op_params_i32(a, 0, prec_i32);
4095
+ }
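Note: ggml_mul_mat_set_prec stores the requested precision in op_params slot 0 of an existing GGML_OP_MUL_MAT node, so it is called on the output tensor after ggml_mul_mat. A usage sketch, assuming the ggml_prec enum from this revision (GGML_PREC_F32 asks for f32 accumulation on backends that honor it); the function name attn_scores is illustrative:

// sketch: request f32 precision for one matmul in a graph
#include "ggml.h"

struct ggml_tensor * attn_scores(struct ggml_context * ctx,
                                 struct ggml_tensor * k, struct ggml_tensor * q) {
    struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);

    // opt in to higher precision for this node only
    ggml_mul_mat_set_prec(kq, GGML_PREC_F32);

    return kq;
}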
4096
+
4097
  // ggml_mul_mat_id
4098
 
4099
  struct ggml_tensor * ggml_mul_mat_id(
 
4118
  }
4119
 
4120
  const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
4121
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
4122
 
4123
  ggml_set_op_params_i32(result, 0, id);
4124
  ggml_set_op_params_i32(result, 1, n_as);
 
4156
 
4157
  // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
4158
  const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
4159
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
4160
 
4161
  result->op = GGML_OP_OUT_PROD;
4162
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
4171
  static struct ggml_tensor * ggml_scale_impl(
4172
  struct ggml_context * ctx,
4173
  struct ggml_tensor * a,
4174
+ float s,
4175
  bool inplace) {
 
4176
  GGML_ASSERT(ggml_is_padded_1d(a));
4177
 
4178
  bool is_node = false;
4179
 
4180
+ if (a->grad) {
4181
  is_node = true;
4182
  }
4183
 
4184
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
4185
 
4186
+ ggml_set_op_params(result, &s, sizeof(s));
4187
+
4188
  result->op = GGML_OP_SCALE;
4189
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
4190
  result->src[0] = a;
 
4191
 
4192
  return result;
4193
  }
 
4195
  struct ggml_tensor * ggml_scale(
4196
  struct ggml_context * ctx,
4197
  struct ggml_tensor * a,
4198
+ float s) {
4199
+ return ggml_scale_impl(ctx, a, s, false);
4200
  }
4201
 
4202
  struct ggml_tensor * ggml_scale_inplace(
4203
  struct ggml_context * ctx,
4204
  struct ggml_tensor * a,
4205
+ float s) {
4206
+ return ggml_scale_impl(ctx, a, s, true);
4207
  }
4208
 
4209
  // ggml_set
 
4441
  //GGML_ASSERT(false);
4442
  }
4443
 
4444
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
4445
  ggml_format_name(result, "%s (reshaped)", a->name);
4446
 
4447
  result->op = GGML_OP_RESHAPE;
 
4819
  }
4820
 
4821
  const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
4822
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);
4823
 
4824
  result->op = GGML_OP_DIAG;
4825
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
5466
  is_node = true;
5467
  }
5468
 
5469
+ const int64_t ne[2] = {
5470
  ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
5471
  a->ne[1],
5472
  };
 
5585
  enum ggml_sort_order order) {
5586
  bool is_node = false;
5587
 
5588
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
5589
 
5590
  ggml_set_op_params_i32(result, 0, (int32_t) order);
5591
 
 
5632
  }
5633
 
5634
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
5635
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);
5636
 
5637
  int32_t t = masked ? 1 : 0;
5638
  ggml_set_op_params(result, &t, sizeof(t));
 
5665
  }
5666
 
5667
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
5668
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
5669
 
5670
  result->op = GGML_OP_FLASH_FF;
5671
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
5781
  const int np = npx*npy;
5782
 
5783
  const int64_t ne[4] = { a->ne[0], w, w, np, };
 
5784
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5785
 
5786
  int32_t params[] = { npx, npy, w };
 
7764
  const int ith = params->ith;
7765
  const int nth = params->nth;
7766
 
 
7767
  #ifdef GGML_USE_CLBLAST
7768
  if (src1->backend == GGML_BACKEND_GPU) {
7769
+ // TODO: OpenCL kernel support full broadcast
7770
+ GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
7771
  if (ith == 0) {
7772
  ggml_cl_mul(src0, src1, dst);
7773
  }
 
9164
  float eps;
9165
  memcpy(&eps, dst->op_params, sizeof(float));
9166
 
9167
+ GGML_ASSERT(eps > 0.0f);
9168
+
9169
  // TODO: optimize
9170
  for (int64_t i03 = 0; i03 < ne03; i03++) {
9171
  for (int64_t i02 = 0; i02 < ne02; i02++) {
 
9235
  float eps;
9236
  memcpy(&eps, dst->op_params, sizeof(float));
9237
 
9238
+ GGML_ASSERT(eps > 0.0f);
9239
+
9240
  // TODO: optimize
9241
  for (int64_t i03 = 0; i03 < ne03; i03++) {
9242
  for (int64_t i02 = 0; i02 < ne02; i02++) {
 
9580
  }
9581
  #endif
9582
 
 
 
 
 
9583
  static void ggml_compute_forward_mul_mat(
9584
  const struct ggml_compute_params * params,
9585
  const struct ggml_tensor * src0,
9586
  const struct ggml_tensor * src1,
9587
+ struct ggml_tensor * dst) {
 
9588
  int64_t t0 = ggml_perf_time_us();
9589
  UNUSED(t0);
9590
 
 
9652
  const int64_t i03 = i13/r3;
9653
  const int64_t i02 = i12/r2;
9654
 
9655
+ const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
9656
+ const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
9657
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
9658
 
9659
  if (type != GGML_TYPE_F32) {
9660
  float * const wdata = params->wdata;
 
9671
  }
9672
 
9673
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
9674
+ ne1, ne01, ne10,
9675
  1.0f, y, ne10,
9676
  x, ne00,
9677
  0.0f, d, ne01);
 
9687
  if (params->type == GGML_TASK_INIT) {
9688
  if (src1->type != vec_dot_type) {
9689
  char * wdata = params->wdata;
9690
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
9691
 
9692
  assert(params->wsize >= ne11*ne12*ne13*row_size);
9693
  assert(src1->type == GGML_TYPE_F32);
 
9710
  }
9711
 
9712
  const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
9713
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
9714
 
9715
+ const int64_t nr0 = ne01; // src0 rows
9716
+ const int64_t nr1 = ne1*ne12*ne13; // src1 rows
9717
 
9718
  //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
9719
 
 
9755
  for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
9756
  for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
9757
  for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
9758
+ const int64_t i13 = (ir1/(ne12*ne1));
9759
+ const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
9760
+ const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
9761
 
9762
  // broadcast src0 into src1
9763
  const int64_t i03 = i13/r3;
 
9797
 
9798
  static void ggml_compute_forward_mul_mat_id(
9799
  const struct ggml_compute_params * params,
9800
+ const struct ggml_tensor * ids,
9801
  const struct ggml_tensor * src1,
9802
  struct ggml_tensor * dst) {
9803
 
9804
+ const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
9805
+
9806
+ GGML_TENSOR_BINARY_OP_LOCALS
9807
+
9808
+ const int ith = params->ith;
9809
+ const int nth = params->nth;
9810
+
9811
+ const enum ggml_type type = src0->type;
9812
+
9813
+ const bool src1_cont = ggml_is_contiguous(src1);
9814
+
9815
+ ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
9816
+ enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
9817
+ ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
9818
+
9819
+ GGML_ASSERT(ne0 == ne01);
9820
+ GGML_ASSERT(ne1 == ne11);
9821
+ GGML_ASSERT(ne2 == ne12);
9822
+ GGML_ASSERT(ne3 == ne13);
9823
+
9824
+ // we don't support permuted src0 or src1
9825
+ GGML_ASSERT(nb00 == ggml_type_size(type));
9826
+ GGML_ASSERT(nb10 == ggml_type_size(src1->type));
9827
 
9828
+ // dst cannot be transposed or permuted
9829
+ GGML_ASSERT(nb0 == sizeof(float));
9830
+ GGML_ASSERT(nb0 <= nb1);
9831
+ GGML_ASSERT(nb1 <= nb2);
9832
+ GGML_ASSERT(nb2 <= nb3);
9833
+
9834
+ // broadcast factors
9835
+ const int64_t r2 = ne12/ne02;
9836
+ const int64_t r3 = ne13/ne03;
9837
+
9838
+ // row groups
9839
  const int id = ggml_get_op_params_i32(dst, 0);
9840
  const int n_as = ggml_get_op_params_i32(dst, 1);
9841
 
9842
+ char * wdata_src1_end = (src1->type == vec_dot_type) ?
9843
+ (char *) params->wdata :
9844
+ (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
9845
+
9846
+ int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
9847
+ int64_t * matrix_rows = matrix_row_counts + n_as; // [n_as][ne11]
9848
+
9849
+ #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
9850
+
9851
+ if (params->type == GGML_TASK_INIT) {
9852
+ char * wdata = params->wdata;
9853
+ if (src1->type != vec_dot_type) {
9854
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
9855
+
9856
+ assert(params->wsize >= ne11*ne12*ne13*row_size);
9857
+ assert(src1->type == GGML_TYPE_F32);
9858
+
9859
+ for (int64_t i13 = 0; i13 < ne13; ++i13) {
9860
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
9861
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
9862
+ from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
9863
+ wdata += row_size;
9864
+ }
9865
+ }
9866
+ }
9867
+ }
9868
+
9869
+ // initialize matrix_row_counts
9870
+ GGML_ASSERT(wdata == wdata_src1_end);
9871
+ memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
9872
+
9873
+ // group rows by src0 matrix
9874
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
9875
+ const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
9876
 
9877
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
9878
+ MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
9879
+ matrix_row_counts[row_id] += 1;
9880
+ }
9881
 
9882
+ return;
 
9883
  }
9884
+
9885
+ if (params->type == GGML_TASK_FINALIZE) {
9886
+ return;
9887
+ }
9888
+
9889
+ // compute each matrix multiplication in sequence
9890
+ for (int cur_a = 0; cur_a < n_as; ++cur_a) {
9891
+ const int64_t cne1 = matrix_row_counts[cur_a];
9892
+
9893
+ if (cne1 == 0) {
9894
+ continue;
9895
+ }
9896
+
9897
+ const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
9898
+
9899
+ const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
9900
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
9901
+
9902
+ const int64_t nr0 = ne01; // src0 rows
9903
+ const int64_t nr1 = cne1*ne12*ne13; // src1 rows
9904
+
9905
+ //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
9906
+
9907
+ // distribute the thread work across the inner or outer loop based on which one is larger
9908
+
9909
+ const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
9910
+ const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
9911
+
9912
+ const int64_t ith0 = ith % nth0;
9913
+ const int64_t ith1 = ith / nth0;
9914
+
9915
+ const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
9916
+ const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
9917
+
9918
+ const int64_t ir010 = dr0*ith0;
9919
+ const int64_t ir011 = MIN(ir010 + dr0, nr0);
9920
+
9921
+ const int64_t ir110 = dr1*ith1;
9922
+ const int64_t ir111 = MIN(ir110 + dr1, nr1);
9923
+
9924
+ //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
9925
+
9926
+ // threads with no work simply yield (not sure if it helps)
9927
+ if (ir010 >= ir011 || ir110 >= ir111) {
9928
+ sched_yield();
9929
+ continue;
9930
+ }
9931
+
9932
+ assert(ne12 % ne02 == 0);
9933
+ assert(ne13 % ne03 == 0);
9934
+
9935
+ // block-tiling attempt
9936
+ const int64_t blck_0 = 16;
9937
+ const int64_t blck_1 = 16;
9938
+
9939
+ // attempt to reduce false-sharing (does not seem to make a difference)
9940
+ float tmp[16];
9941
+
9942
+ for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
9943
+ for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
9944
+ for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
9945
+ const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
9946
+ const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
9947
+ const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
9948
+ const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
9949
+
9950
+ // broadcast src0 into src1
9951
+ const int64_t i03 = i13/r3;
9952
+ const int64_t i02 = i12/r2;
9953
+
9954
+ const int64_t i1 = i11;
9955
+ const int64_t i2 = i12;
9956
+ const int64_t i3 = i13;
9957
+
9958
+ const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
9959
+
9960
+ // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
9961
+ // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
9962
+ // the original src1 data pointer, so we should index using the indices directly
9963
+ // TODO: this is a bit of a hack, we should probably have a better way to handle this
9964
+ const char * src1_col = (const char *) wdata +
9965
+ (src1_cont || src1->type != vec_dot_type
9966
+ ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
9967
+ : (i11*nb11 + i12*nb12 + i13*nb13));
9968
+
9969
+ float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
9970
+
9971
+ //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
9972
+ // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
9973
+ //}
9974
+
9975
+ for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
9976
+ vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
9977
+ }
9978
+ memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
9979
+ }
9980
+ }
9981
+ }
9982
+ }
9983
+
9984
+ #undef MMID_MATRIX_ROW
9985
  }
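
The MMID_MATRIX_ROW bookkeeping above groups src1 rows by which src0 expert matrix the ids tensor selects, so each expert's rows can then be multiplied in one contiguous pass. A minimal standalone sketch of the grouping step, with hypothetical sizes and ids:

    #include <stdint.h>

    // hypothetical: n_as = 4 expert matrices, ne11 = 8 src1 rows, ids holds the chosen expert per row
    static void group_rows_example(void) {
        enum { n_as = 4, ne11 = 8 };
        const int32_t ids[ne11] = {2, 0, 2, 1, 0, 2, 3, 0};

        int64_t matrix_row_counts[n_as] = {0};
        int64_t matrix_rows[n_as*ne11];                // [n_as][ne11], same layout as MMID_MATRIX_ROW

        for (int64_t i01 = 0; i01 < ne11; i01++) {
            const int32_t row_id = ids[i01];
            matrix_rows[row_id*ne11 + matrix_row_counts[row_id]] = i01; // remember which src1 row
            matrix_row_counts[row_id] += 1;
        }
        // result: expert 0 owns src1 rows {1, 4, 7}, expert 2 owns {0, 2, 5},
        // and the per-expert matmul loop walks only those rows
    }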
9986
 
9987
  // ggml_compute_forward_out_prod
 
10325
  static void ggml_compute_forward_scale_f32(
10326
  const struct ggml_compute_params * params,
10327
  const struct ggml_tensor * src0,
 
10328
  struct ggml_tensor * dst) {
10329
  GGML_ASSERT(ggml_is_contiguous(src0));
10330
  GGML_ASSERT(ggml_is_contiguous(dst));
10331
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
10332
 
10333
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10334
  return;
10335
  }
10336
 
10337
  // scale factor
10338
+ float v;
10339
+ memcpy(&v, dst->op_params, sizeof(float));
10340
 
10341
  const int ith = params->ith;
10342
  const int nth = params->nth;
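
Reading the scale factor back with memcpy instead of a pointer cast is the "fix type punning in ggml_scale" item from the commit message. A small illustration of the two patterns:

    #include <stdint.h>
    #include <string.h>

    // op_params is an int32_t array; the float was stored into it with memcpy,
    // so it must be read back the same way rather than through a float* cast
    static float get_scale_param(const int32_t * op_params) {
        float v;
        memcpy(&v, op_params, sizeof(float));      // well-defined
        // float v2 = *(const float *) op_params;  // type punning, the pattern this commit removes
        return v;
    }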
 
10367
  static void ggml_compute_forward_scale(
10368
  const struct ggml_compute_params * params,
10369
  const struct ggml_tensor * src0,
 
10370
  struct ggml_tensor * dst) {
10371
  switch (src0->type) {
10372
  case GGML_TYPE_F32:
10373
  {
10374
+ ggml_compute_forward_scale_f32(params, src0, dst);
10375
  } break;
10376
  default:
10377
  {
 
11560
  }
11561
  } else {
11562
  // TODO: this might be wrong for ne0 != n_dims - need double check
11563
+ // it seems we have to rope just the first n_dims elements and do nothing with the rest
11564
+ // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
11565
  theta_base *= freq_scale;
11566
+ for (int64_t ic = 0; ic < ne0; ic += 2) {
11567
+ if (ic < n_dims) {
11568
+ const int64_t ib = 0;
11569
+
11570
  // simplified from `(ib * n_dims + ic) * inv_ndims`
11571
  float cur_rot = inv_ndims * ic - ib;
11572
 
 
11589
 
11590
  dst_data[0] = x0*cos_theta - x1*sin_theta;
11591
  dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
11592
+ } else {
11593
+ const int64_t i0 = ic;
11594
+
11595
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11596
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11597
+
11598
+ dst_data[0] = src[0];
11599
+ dst_data[1] = src[1];
11600
  }
11601
  }
11602
  }
 
11724
  }
11725
  } else {
11726
  // TODO: this might be wrong for ne0 != n_dims - need double check
11727
+ // it seems we have to rope just the first n_dims elements and do nothing with the rest
11728
+ // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
11729
  theta_base *= freq_scale;
11730
+ for (int64_t ic = 0; ic < ne0; ic += 2) {
11731
+ if (ic < n_dims) {
11732
+ const int64_t ib = 0;
11733
+
11734
  // simplified from `(ib * n_dims + ic) * inv_ndims`
11735
  float cur_rot = inv_ndims * ic - ib;
11736
 
 
11753
 
11754
  dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
11755
  dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
11756
+ } else {
11757
+ const int64_t i0 = ic;
11758
+
11759
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11760
+ ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11761
+
11762
+ dst_data[0] = src[0];
11763
+ dst_data[1] = src[1];
11764
  }
11765
  }
11766
  }
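
Both the F32 and F16 paths above implement the same partial rotation: only the first n_dims components of each row are rotated, and the remaining ne0 - n_dims components are copied through unchanged. A minimal sketch of that split on a plain float row, with hypothetical sizes and a fixed theta (the real kernel computes cos/sin per position):

    #include <stdint.h>

    static void partial_rope_example(void) {
        enum { ne0 = 8, n_dims = 4 };
        const float src[ne0] = {1, 2, 3, 4, 5, 6, 7, 8};
        float dst[ne0];

        const float cos_theta = 0.8f;
        const float sin_theta = 0.6f;

        for (int64_t ic = 0; ic < ne0; ic += 2) {
            if (ic < n_dims) {
                const int64_t i0 = ic/2;                  // pair (i0, i0 + n_dims/2) is rotated
                const float x0 = src[i0];
                const float x1 = src[i0 + n_dims/2];
                dst[i0]            = x0*cos_theta - x1*sin_theta;
                dst[i0 + n_dims/2] = x0*sin_theta + x1*cos_theta;
            } else {
                dst[ic]     = src[ic];                    // tail is copied through unchanged
                dst[ic + 1] = src[ic + 1];
            }
        }
        (void) dst;
    }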
 
14369
  } break;
14370
  case GGML_OP_MUL_MAT:
14371
  {
14372
+ ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
14373
  } break;
14374
  case GGML_OP_MUL_MAT_ID:
14375
  {
 
14381
  } break;
14382
  case GGML_OP_SCALE:
14383
  {
14384
+ ggml_compute_forward_scale(params, tensor->src[0], tensor);
14385
  } break;
14386
  case GGML_OP_SET:
14387
  {
 
14745
  return replacements->vals[i];
14746
  }
14747
 
14748
+ struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, GGML_MAX_DIMS, node->ne);
14749
 
14750
  // insert clone into replacements
14751
  GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
 
14837
 
14838
  static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
14839
  if (ggml_hash_contains(zero_table, a)) {
14840
+ struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f);
14841
  return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
14842
  } else {
14843
  return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
 
14973
  src0->grad,
14974
  ggml_scale(ctx,
14975
  ggml_mul(ctx, src0, tensor->grad),
14976
+ 2.0f),
14977
  zero_table);
14978
  }
14979
  } break;
 
14987
  ggml_div(ctx,
14988
  tensor->grad,
14989
  tensor),
14990
+ 0.5f),
14991
  zero_table);
14992
  }
14993
  } break;
 
15153
  {
15154
  // necessary for llama
15155
  if (src0->grad) {
15156
+ float s;
15157
+ memcpy(&s, tensor->op_params, sizeof(float));
15158
+
15159
  src0->grad =
15160
  ggml_add_or_set(ctx,
15161
  src0->grad,
15162
+ ggml_scale_impl(ctx, tensor->grad, s, false),
 
 
 
 
 
 
 
15163
  zero_table);
15164
  }
15165
  } break;
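
The bare constants passed to ggml_scale above are the usual derivatives now given as plain floats: 2.0f for d(x^2)/dx = 2x and 0.5f for d(sqrt(x))/dx = 1/(2*sqrt(x)). For GGML_OP_SCALE itself the backward pass is just the chain rule: with y = s*x, dL/dx = s*dL/dy, so the stored s is read back out of op_params (again via memcpy) and applied to the incoming gradient. A numeric check with hypothetical values:

    // y = s*x with s = 3, upstream gradient dL/dy = 0.5
    // dL/dx = s * dL/dy = 3 * 0.5 = 1.5, which is what ggml_scale_impl(tensor->grad, s, false) produces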
 
15337
  const int n_past = ((int32_t *) tensor->op_params)[0];
15338
  src0->grad =
15339
  ggml_add_or_set(ctx, src0->grad,
15340
+ /* ggml_diag_mask_inf_impl() shouldn't be here */
15341
+ /* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
15342
  ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
15343
  zero_table);
15344
  }
 
16167
  } break;
16168
  case GGML_OP_MUL_MAT_ID:
16169
  {
 
16170
  n_tasks = n_threads;
16171
  } break;
16172
  case GGML_OP_OUT_PROD:
 
16495
  } else
16496
  #endif
16497
  if (node->src[1]->type != vec_dot_type) {
16498
+ cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
16499
  }
16500
  } break;
16501
  case GGML_OP_MUL_MAT_ID:
16502
  {
16503
+ const struct ggml_tensor * src0 = node->src[2];
16504
+ const struct ggml_tensor * src1 = node->src[1];
16505
+ const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
16506
+ if (src1->type != vec_dot_type) {
16507
+ cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
 
 
 
 
 
 
 
 
16508
  }
16509
+ const int n_as = ggml_get_op_params_i32(node, 1);
16510
+ cur = GGML_PAD(cur, sizeof(int64_t)); // align
16511
+ cur += n_as * sizeof(int64_t); // matrix_row_counts
16512
+ cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
16513
  } break;
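
The work-buffer estimate for GGML_OP_MUL_MAT_ID mirrors the layout used in the forward pass: an optional converted copy of src1 padded to int64_t alignment, then n_as row counts, then an n_as x ne11 table of row indices. A worked example with hypothetical sizes:

    // hypothetical: src1 is F32 with ne10 = 4096, ne11 = 8 rows,
    // vec_dot_type = GGML_TYPE_Q8_0 (blocks of 32 elements, 34 bytes each), n_as = 8 experts
    //
    //   converted src1   : ggml_row_size(GGML_TYPE_Q8_0, 4096*8) = 34*4096*8/32 = 34816 bytes
    //   GGML_PAD(..., 8) : 34816 (already 8-byte aligned)
    //   matrix_row_counts: n_as*sizeof(int64_t)      =  8*8  =    64 bytes
    //   matrix_rows      : n_as*ne11*sizeof(int64_t) = 8*8*8 =   512 bytes
    //   total wsize      : 34816 + 64 + 512          = 35392 bytes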
16514
  case GGML_OP_OUT_PROD:
16515
  {
 
16739
  fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
16740
  ggml_type_name(tensor->type),
16741
  ggml_op_name (tensor->op),
16742
+ ggml_n_dims(tensor),
16743
  ne[0], ne[1], ne[2], ne[3],
16744
  nb[0], nb[1], nb[2], nb[3],
16745
  tensor->data,
 
16754
  arg,
16755
  ggml_type_name(tensor->type),
16756
  ggml_op_name (tensor->op),
16757
+ ggml_n_dims(tensor),
16758
  ne[0], ne[1], ne[2], ne[3],
16759
  nb[0], nb[1], nb[2], nb[3],
16760
  tensor->data,
 
16844
 
16845
  const uint32_t type = tensor->type;
16846
  const uint32_t op = tensor->op;
 
16847
 
16848
  fwrite(&type, sizeof(uint32_t), 1, fout);
16849
  fwrite(&op, sizeof(uint32_t), 1, fout);
 
16850
 
16851
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
16852
  const uint64_t ne = tensor->ne[j];
 
16876
 
16877
  const uint32_t type = tensor->type;
16878
  const uint32_t op = tensor->op;
 
16879
 
16880
  fwrite(&type, sizeof(uint32_t), 1, fout);
16881
  fwrite(&op, sizeof(uint32_t), 1, fout);
 
16882
 
16883
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
16884
  const uint64_t ne = tensor->ne[j];
 
17050
  {
17051
  uint32_t type;
17052
  uint32_t op;
 
17053
 
17054
  for (uint32_t i = 0; i < n_leafs; ++i) {
17055
  type = *(const uint32_t *) ptr; ptr += sizeof(type);
17056
  op = *(const uint32_t *) ptr; ptr += sizeof(op);
 
17057
 
17058
  int64_t ne[GGML_MAX_DIMS];
17059
  size_t nb[GGML_MAX_DIMS];
 
17069
  nb[j] = nb_cur;
17070
  }
17071
 
17072
+ struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
17073
 
17074
  tensor->op = (enum ggml_op) op;
17075
 
 
17086
 
17087
  ptr += ggml_nbytes(tensor);
17088
 
17089
+ fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
17090
  }
17091
  }
17092
 
 
17096
  {
17097
  uint32_t type;
17098
  uint32_t op;
 
17099
 
17100
  for (uint32_t i = 0; i < n_nodes; ++i) {
17101
  type = *(const uint32_t *) ptr; ptr += sizeof(type);
17102
  op = *(const uint32_t *) ptr; ptr += sizeof(op);
 
17103
 
17104
  enum ggml_op eop = (enum ggml_op) op;
17105
 
 
17170
  } break;
17171
  default:
17172
  {
17173
+ tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
17174
 
17175
  tensor->op = eop;
17176
  } break;
 
17189
 
17190
  result->nodes[i] = tensor;
17191
 
17192
+ fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
17193
  }
17194
  }
17195
  }
 
17327
  fprintf(fp, "(%s)|", ggml_type_name(node->type));
17328
  }
17329
 
17330
+ if (ggml_is_matrix(node)) {
17331
  fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
17332
  } else {
17333
  fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
 
17594
  int64_t i = 0;
17595
  for (int p = 0; p < np; ++p) {
17596
  const int64_t ne = ggml_nelements(ps[p]);
17597
+ const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? decay : 0.0f) * sched;
17598
  for (int64_t j = 0; j < ne; ++j) {
17599
  float x = ggml_get_f32_1d(ps[p], j);
17600
  float g_ = g[i]*gnorm;
 
18868
  return NULL;
18869
  }
18870
 
18871
+ const size_t size_cur = ggml_row_size(info->type, ne);
18872
 
18873
  ctx->size += GGML_PAD(size_cur, ctx->alignment);
18874
  }
 
19197
  return ctx->infos[i].name.data;
19198
  }
19199
 
19200
+ enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
19201
+ return ctx->infos[i].type;
19202
+ }
19203
+
19204
  // returns the index
19205
  static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
19206
  const int idx = gguf_find_key(ctx, key);
 
19376
  ctx->infos[idx].ne[i] = 1;
19377
  }
19378
 
19379
+ ctx->infos[idx].n_dims = ggml_n_dims(tensor);
19380
+ for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
19381
  ctx->infos[idx].ne[i] = tensor->ne[i];
19382
  }
19383
 
ggml.h CHANGED
@@ -303,7 +303,7 @@ extern "C" {
303
 
304
  #if defined(__ARM_NEON) && defined(__CUDACC__)
305
  typedef half ggml_fp16_t;
306
- #elif defined(__ARM_NEON)
307
  typedef __fp16 ggml_fp16_t;
308
  #else
309
  typedef uint16_t ggml_fp16_t;
@@ -343,6 +343,12 @@ extern "C" {
343
  GGML_TYPE_COUNT,
344
  };
345
 
 
 
 
 
 
 
346
  enum ggml_backend_type {
347
  GGML_BACKEND_CPU = 0,
348
  GGML_BACKEND_GPU = 10,
@@ -478,7 +484,8 @@ extern "C" {
478
  enum ggml_log_level {
479
  GGML_LOG_LEVEL_ERROR = 2,
480
  GGML_LOG_LEVEL_WARN = 3,
481
- GGML_LOG_LEVEL_INFO = 4
 
482
  };
483
 
484
  // ggml object
@@ -502,7 +509,6 @@ extern "C" {
502
 
503
  struct ggml_backend_buffer * buffer;
504
 
505
- int n_dims;
506
  int64_t ne[GGML_MAX_DIMS]; // number of elements
507
  size_t nb[GGML_MAX_DIMS]; // stride in bytes:
508
  // nb[0] = ggml_type_size(type)
@@ -534,7 +540,7 @@ extern "C" {
534
 
535
  void * extra; // extra things e.g. for ggml-cuda.cu
536
 
537
- char padding[12];
538
  };
539
 
540
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -639,11 +645,14 @@ extern "C" {
639
  GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
640
  GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
641
  GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
642
- GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
643
 
644
- GGML_API int ggml_blck_size (enum ggml_type type);
645
- GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
646
- GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
 
 
 
 
647
 
648
  GGML_API const char * ggml_type_name(enum ggml_type type);
649
  GGML_API const char * ggml_op_name (enum ggml_op op);
@@ -662,6 +671,11 @@ extern "C" {
662
  GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
663
  GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
664
  GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
 
 
 
 
 
665
 
666
  GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
667
 
@@ -722,8 +736,8 @@ extern "C" {
722
  GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
723
 
724
  // Context tensor enumeration and lookup
725
- GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
726
- GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
727
  GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
728
 
729
  GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
@@ -1050,6 +1064,12 @@ extern "C" {
1050
  struct ggml_tensor * a,
1051
  struct ggml_tensor * b);
1052
 
 
 
 
 
 
 
1053
  // indirect matrix multiplication
1054
  // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
1055
  GGML_API struct ggml_tensor * ggml_mul_mat_id(
@@ -1075,13 +1095,13 @@ extern "C" {
1075
  GGML_API struct ggml_tensor * ggml_scale(
1076
  struct ggml_context * ctx,
1077
  struct ggml_tensor * a,
1078
- struct ggml_tensor * b);
1079
 
1080
  // in-place, returns view(a)
1081
  GGML_API struct ggml_tensor * ggml_scale_inplace(
1082
  struct ggml_context * ctx,
1083
  struct ggml_tensor * a,
1084
- struct ggml_tensor * b);
1085
 
1086
  // b -> view(a,offset,nb1,nb2,3), return modified a
1087
  GGML_API struct ggml_tensor * ggml_set(
@@ -2116,10 +2136,11 @@ extern "C" {
2116
  GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
2117
  GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
2118
 
2119
- GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
2120
- GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
2121
- GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
2122
- GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
 
2123
 
2124
  // overrides existing values or adds a new one
2125
  GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
 
303
 
304
  #if defined(__ARM_NEON) && defined(__CUDACC__)
305
  typedef half ggml_fp16_t;
306
+ #elif defined(__ARM_NEON) && !defined(_MSC_VER)
307
  typedef __fp16 ggml_fp16_t;
308
  #else
309
  typedef uint16_t ggml_fp16_t;
 
343
  GGML_TYPE_COUNT,
344
  };
345
 
346
+ // precision
347
+ enum ggml_prec {
348
+ GGML_PREC_DEFAULT,
349
+ GGML_PREC_F32,
350
+ };
351
+
352
  enum ggml_backend_type {
353
  GGML_BACKEND_CPU = 0,
354
  GGML_BACKEND_GPU = 10,
 
484
  enum ggml_log_level {
485
  GGML_LOG_LEVEL_ERROR = 2,
486
  GGML_LOG_LEVEL_WARN = 3,
487
+ GGML_LOG_LEVEL_INFO = 4,
488
+ GGML_LOG_LEVEL_DEBUG = 5
489
  };
490
 
491
  // ggml object
 
509
 
510
  struct ggml_backend_buffer * buffer;
511
 
 
512
  int64_t ne[GGML_MAX_DIMS]; // number of elements
513
  size_t nb[GGML_MAX_DIMS]; // stride in bytes:
514
  // nb[0] = ggml_type_size(type)
 
540
 
541
  void * extra; // extra things e.g. for ggml-cuda.cu
542
 
543
+ char padding[8];
544
  };
545
 
546
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
645
  GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
646
  GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
647
  GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
 
648
 
649
+ GGML_API int ggml_blck_size(enum ggml_type type);
650
+ GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
651
+ GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
652
+
653
+ GGML_DEPRECATED(
654
+ GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
655
+ "use ggml_row_size() instead");
656
 
657
  GGML_API const char * ggml_type_name(enum ggml_type type);
658
  GGML_API const char * ggml_op_name (enum ggml_op op);
 
671
  GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
672
  GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
673
  GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
674
+ GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
675
+ GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
676
+ GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
677
+ GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
678
+ GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
679
 
680
  GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
681
 
 
736
  GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
737
 
738
  // Context tensor enumeration and lookup
739
+ GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
740
+ GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
741
  GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
742
 
743
  GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
 
1064
  struct ggml_tensor * a,
1065
  struct ggml_tensor * b);
1066
 
1067
+ // change the precision of a matrix multiplication
1068
+ // set to GGML_PREC_F32 for higher precision (useful for phi-2)
1069
+ GGML_API void ggml_mul_mat_set_prec(
1070
+ struct ggml_tensor * a,
1071
+ enum ggml_prec prec);
1072
+
1073
  // indirect matrix multiplication
1074
  // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
1075
  GGML_API struct ggml_tensor * ggml_mul_mat_id(
 
1095
  GGML_API struct ggml_tensor * ggml_scale(
1096
  struct ggml_context * ctx,
1097
  struct ggml_tensor * a,
1098
+ float s);
1099
 
1100
  // in-place, returns view(a)
1101
  GGML_API struct ggml_tensor * ggml_scale_inplace(
1102
  struct ggml_context * ctx,
1103
  struct ggml_tensor * a,
1104
+ float s);
1105
 
1106
  // b -> view(a,offset,nb1,nb2,3), return modified a
1107
  GGML_API struct ggml_tensor * ggml_set(
 
2136
  GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
2137
  GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
2138
 
2139
+ GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
2140
+ GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
2141
+ GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
2142
+ GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
2143
+ GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
2144
 
2145
  // overrides existing values or adds a new one
2146
  GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
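
Taken together, the ggml.h changes above turn several tensor-typed or float-approximate helpers into plain scalar APIs. A short migration sketch of the affected call sites; ctx, cur, KQ, gctx, i and n_embd are hypothetical stand-ins for whatever a downstream project already has:

    #include "ggml.h"

    static void api_migration_example(struct ggml_context * ctx, struct ggml_tensor * cur,
                                      struct ggml_tensor * KQ,
                                      const struct gguf_context * gctx, int i, int64_t n_embd) {
        // ggml_scale(): the factor is now a plain float, not a 1-element F32 tensor
        cur = ggml_scale(ctx, cur, 0.125f);

        // ggml_row_size(): exact byte count per row, replacing the float-based ggml_type_sizef()
        const size_t kv_row = ggml_row_size(GGML_TYPE_F16, n_embd);

        // optional F32 accumulation for precision-sensitive matmuls (e.g. phi-2)
        ggml_mul_mat_set_prec(KQ, GGML_PREC_F32);

        // gguf: a tensor's type can now be queried directly
        const enum ggml_type t = gguf_get_tensor_type(gctx, i);

        (void) cur; (void) kv_row; (void) t;
    }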
whisper.cpp CHANGED
@@ -487,8 +487,8 @@ static size_t whisper_allocr_size(struct whisper_allocr & allocr) {
487
 
488
  // measure the memory usage of a graph and prepare the allocr's internal data buffer
489
  static void whisper_allocr_graph_init(struct whisper_allocr & allocr, ggml_backend_t backend, std::function<struct ggml_cgraph *()> && get_graph) {
490
- auto & alloc = allocr.alloc;
491
- auto & meta = allocr.meta;
492
 
493
  alloc = ggml_allocr_new_measure_from_backend(backend);
494
 
@@ -1777,7 +1777,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
1777
 
1778
  ggml_cgraph * gf = ggml_new_graph_custom(ctx0, WHISPER_MAX_NODES, false);
1779
 
1780
- ggml_allocr * alloc = wstate.alloc_encode.alloc;
1781
 
1782
  //struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_ctx, n_state);
1783
  //ggml_allocr_alloc(alloc, cur);
@@ -1787,13 +1787,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
1787
  //}
1788
  struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_conv);
1789
 
1790
- struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
1791
- ggml_allocr_alloc(alloc, KQscale);
1792
-
1793
- if (!ggml_allocr_is_measure(alloc)) {
1794
- const float val = 1.0f/sqrtf(float(n_state)/n_head);
1795
- ggml_backend_tensor_set(KQscale, &val, 0, sizeof(float));
1796
- }
1797
 
1798
  // ===================================================================
1799
  // NOTE: experimenting with partial evaluation of the encoder (ignore)
@@ -1843,14 +1837,14 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
1843
 
1844
  Qcur = ggml_add(ctx0, Qcur, layer.attn_q_b);
1845
 
1846
- //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1847
 
1848
  // note: no bias for Key
1849
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
1850
  layer.attn_k_w,
1851
  cur);
1852
 
1853
- //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1854
 
1855
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
1856
  layer.attn_v_w,
@@ -2032,7 +2026,7 @@ static struct ggml_cgraph * whisper_build_graph_cross(
2032
 
2033
  ggml_cgraph * gf = ggml_new_graph(ctx0);
2034
 
2035
- ggml_allocr * alloc = wstate.alloc_cross.alloc;
2036
 
2037
  //struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
2038
  //ggml_allocr_alloc(alloc, cur);
@@ -2042,13 +2036,7 @@ static struct ggml_cgraph * whisper_build_graph_cross(
2042
  //}
2043
  struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_enc);
2044
 
2045
- struct ggml_tensor * Kscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
2046
- ggml_allocr_alloc(alloc, Kscale);
2047
-
2048
- if (!ggml_allocr_is_measure(alloc)) {
2049
- const float val = pow(float(n_state) / n_head, -0.25);
2050
- ggml_backend_tensor_set(Kscale, &val, 0, sizeof(float));
2051
- }
2052
 
2053
  for (int il = 0; il < model.hparams.n_text_layer; ++il) {
2054
  auto & layer = model.layers_decoder[il];
@@ -2207,13 +2195,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
2207
  }
2208
  }
2209
 
2210
- struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
2211
- ggml_allocr_alloc(alloc, KQscale);
2212
-
2213
- if (!ggml_allocr_is_measure(alloc)) {
2214
- const float val = pow(float(n_state)/n_head, -0.25);
2215
- ggml_backend_tensor_set(KQscale, &val, 0, sizeof(float));
2216
- }
2217
 
2218
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
2219
  ggml_allocr_alloc(alloc, KQ_mask);
@@ -6128,7 +6110,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
6128
 
6129
  // multi-thread
6130
 
6131
- for (uint32_t k = 1; k <= n_threads; k++) {
6132
  char * src = (char *) malloc(size);
6133
  char * dst = (char *) malloc(size);
6134
 
@@ -6152,13 +6134,13 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
6152
  const int64_t t0 = ggml_time_us();
6153
 
6154
  std::vector<std::thread> threads(k - 1);
6155
- for (uint32_t th = 0; th < k - 1; ++th) {
6156
  threads[th] = std::thread(helper, th);
6157
  }
6158
 
6159
  helper(k - 1);
6160
 
6161
- for (uint32_t th = 0; th < k - 1; ++th) {
6162
  threads[th].join();
6163
  }
6164
 
 
487
 
488
  // measure the memory usage of a graph and prepare the allocr's internal data buffer
489
  static void whisper_allocr_graph_init(struct whisper_allocr & allocr, ggml_backend_t backend, std::function<struct ggml_cgraph *()> && get_graph) {
490
+ auto & alloc = allocr.alloc;
491
+ auto & meta = allocr.meta;
492
 
493
  alloc = ggml_allocr_new_measure_from_backend(backend);
494
 
 
1777
 
1778
  ggml_cgraph * gf = ggml_new_graph_custom(ctx0, WHISPER_MAX_NODES, false);
1779
 
1780
+ //ggml_allocr * alloc = wstate.alloc_encode.alloc;
1781
 
1782
  //struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_ctx, n_state);
1783
  //ggml_allocr_alloc(alloc, cur);
 
1787
  //}
1788
  struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_conv);
1789
 
1790
+ const float KQscale = 1.0f/sqrtf(float(n_state)/n_head);
 
 
 
 
 
 
1791
 
1792
  // ===================================================================
1793
  // NOTE: experimenting with partial evaluation of the encoder (ignore)
 
1837
 
1838
  Qcur = ggml_add(ctx0, Qcur, layer.attn_q_b);
1839
 
1840
+ //Qcur = ggml_scale(ctx0, Qcur, pow(float(n_state)/n_head, -0.25));
1841
 
1842
  // note: no bias for Key
1843
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
1844
  layer.attn_k_w,
1845
  cur);
1846
 
1847
+ //Kcur = ggml_scale(ctx0, Kcur, pow(float(n_state)/n_head, -0.25));
1848
 
1849
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
1850
  layer.attn_v_w,
 
2026
 
2027
  ggml_cgraph * gf = ggml_new_graph(ctx0);
2028
 
2029
+ //ggml_allocr * alloc = wstate.alloc_cross.alloc;
2030
 
2031
  //struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
2032
  //ggml_allocr_alloc(alloc, cur);
 
2036
  //}
2037
  struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_enc);
2038
 
2039
+ const float Kscale = pow(float(n_state) / n_head, -0.25);
 
 
 
 
 
 
2040
 
2041
  for (int il = 0; il < model.hparams.n_text_layer; ++il) {
2042
  auto & layer = model.layers_decoder[il];
 
2195
  }
2196
  }
2197
 
2198
+ const float KQscale = pow(float(n_state)/n_head, -0.25);
 
 
 
 
 
 
2199
 
2200
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
2201
  ggml_allocr_alloc(alloc, KQ_mask);
 
6110
 
6111
  // multi-thread
6112
 
6113
+ for (int32_t k = 1; k <= n_threads; k++) {
6114
  char * src = (char *) malloc(size);
6115
  char * dst = (char *) malloc(size);
6116
 
 
6134
  const int64_t t0 = ggml_time_us();
6135
 
6136
  std::vector<std::thread> threads(k - 1);
6137
+ for (int32_t th = 0; th < k - 1; ++th) {
6138
  threads[th] = std::thread(helper, th);
6139
  }
6140
 
6141
  helper(k - 1);
6142
 
6143
+ for (int32_t th = 0; th < k - 1; ++th) {
6144
  threads[th].join();
6145
  }
6146
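
With ggml_scale() now taking a float, the 1-element KQscale/Kscale tensors and their allocator setup above are gone; the scale is just a host-side constant, presumably fed straight into the scaling call further down. A minimal sketch of that pattern (KQ, ctx0, n_state and n_head are stand-ins from the surrounding whisper.cpp code):

    const float KQscale = 1.0f/sqrtf((float) n_state/n_head);
    struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQscale); // float overload from this sync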