ggerganov committed on
Commit b92d757 · unverified · 1 Parent(s): 3eb6cbf

talk-llama : sync llama.cpp

examples/talk-llama/llama.cpp CHANGED
@@ -509,7 +509,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
     {
         { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
-        { LLM_TENSOR_OUTPUT, "output" },
         { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
         { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
         { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
@@ -851,9 +850,9 @@ struct LLM_TN {
 //
 
 static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
-    { LLAMA_ROPE_SCALING_NONE, "none" },
-    { LLAMA_ROPE_SCALING_LINEAR, "linear" },
-    { LLAMA_ROPE_SCALING_YARN, "yarn" },
+    { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
+    { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
+    { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
 };
 
 static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
@@ -863,7 +862,7 @@ static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
         }
     }
 
-    return LLAMA_ROPE_SCALING_UNSPECIFIED;
+    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
 }
 
 static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
@@ -1581,7 +1580,7 @@ struct llama_hparams {
     bool causal_attn = true;
     bool need_kq_pos = false;
 
-    uint32_t pooling_type = LLAMA_POOLING_NONE;
+    uint32_t pooling_type = LLAMA_POOLING_TYPE_NONE;
 
     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
@@ -2346,9 +2345,9 @@ namespace GGUFMeta {
 
     static const char * override_type_to_str(const llama_model_kv_override_type ty) {
         switch (ty) {
-            case LLAMA_KV_OVERRIDE_BOOL: return "bool";
-            case LLAMA_KV_OVERRIDE_INT: return "int";
-            case LLAMA_KV_OVERRIDE_FLOAT: return "float";
+            case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
+            case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
+            case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
         }
         return "unknown";
     }
@@ -2359,13 +2358,13 @@ namespace GGUFMeta {
         LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
             __func__, override_type_to_str(override->tag), override->key);
         switch (override->tag) {
-            case LLAMA_KV_OVERRIDE_BOOL: {
+            case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
                 LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false");
             } break;
-            case LLAMA_KV_OVERRIDE_INT: {
+            case LLAMA_KV_OVERRIDE_TYPE_INT: {
                 LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value);
             } break;
-            case LLAMA_KV_OVERRIDE_FLOAT: {
+            case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
                 LLAMA_LOG_INFO("%.6f\n", override->float_value);
             } break;
             default:
@@ -2384,7 +2383,7 @@ namespace GGUFMeta {
     template<typename OT>
     static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
     try_override(OT & target, const struct llama_model_kv_override *override) {
-        if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
+        if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, override)) {
             target = override->bool_value;
             return true;
         }
@@ -2394,7 +2393,7 @@ namespace GGUFMeta {
     template<typename OT>
     static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
     try_override(OT & target, const struct llama_model_kv_override *override) {
-        if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
+        if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, override)) {
            target = override->int_value;
            return true;
        }
@@ -2404,7 +2403,7 @@ namespace GGUFMeta {
     template<typename OT>
     static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
     try_override(T & target, const struct llama_model_kv_override *override) {
-        if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
+        if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, override)) {
            target = override->float_value;
            return true;
        }
@@ -2546,6 +2545,7 @@ struct llama_model_loader {
             case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
             case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
             case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
+            case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2891,6 +2891,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
 
         default: return "unknown, may not work";
     }
@@ -2997,7 +2999,7 @@ static void llm_load_hparams(
     std::string rope_scaling("linear");
     ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
     hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
-    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
+    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
 
     // rope_freq_scale (inverse of the kv) is optional
     float ropescale = 0.0f;
@@ -3641,7 +3643,7 @@ static bool llm_load_tensors(
         model.buft_layer[i] = llama_default_buffer_type_cpu(true);
     }
 
-    if (split_mode == LLAMA_SPLIT_LAYER) {
+    if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
         // calculate the split points
         int device_count = llama_get_device_count();
         bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
@@ -3680,10 +3682,10 @@ static bool llm_load_tensors(
         }
     } else {
         ggml_backend_buffer_type_t split_buft;
-        if (split_mode == LLAMA_SPLIT_ROW) {
+        if (split_mode == LLAMA_SPLIT_MODE_ROW) {
             split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
         } else {
-            // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported
+            // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
             split_buft = llama_default_buffer_type_offload(main_gpu);
         }
         // assign the repeating layers
@@ -4056,7 +4058,10 @@ static bool llm_load_tensors(
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
 
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                    // same as tok_embd, duplicated to allow offloading
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    ml.n_created--; // artificial tensor
+                    ml.size_data += ggml_nbytes(model.output);
                 }
 
                 for (int i = 0; i < n_layer; ++i) {
@@ -5065,7 +5070,7 @@ struct llm_build_context {
         kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
         n_orig_ctx (cparams.n_yarn_orig_ctx),
         do_rope_shift (worst_case || kv_self.has_shift),
-        pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
+        pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_TYPE_NONE),
         cb (cb),
         buf_compute_meta (lctx.buf_compute_meta) {
             // all initializations should be done in init()
@@ -6045,12 +6050,12 @@ struct llm_build_context {
         cur = inpL;
 
         // pooling layer
-        if (pooling_type == LLAMA_POOLING_MEAN) {
+        if (pooling_type == LLAMA_POOLING_TYPE_MEAN) {
             cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
-        } else if (pooling_type == LLAMA_POOLING_CLS) {
+        } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) {
             cur = ggml_get_rows(ctx0, cur, inp_cls);
        } else {
-            GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
+            GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type");
        }
        cb(cur, "result_embd", -1);
 
@@ -7749,7 +7754,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
+    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
         const int64_t n_tokens = batch.n_tokens;
 
         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
@@ -7777,7 +7782,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
+    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
         const int64_t n_tokens = batch.n_tokens;
 
         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
@@ -10542,6 +10547,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS;
        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            new_type = GGML_TYPE_Q4_K;
+        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
        }
@@ -10573,13 +10584,17 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            new_type = GGML_TYPE_Q8_0;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
-            new_type = GGML_TYPE_Q2_K;
+            new_type = GGML_TYPE_IQ3_XXS;
+        }
+    } else if (name.find("attn_q.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
+            new_type = GGML_TYPE_IQ3_XXS;
        }
    } else if (name.find("ffn_down") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
        int i_layer = info.first, n_layer = info.second;
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
@@ -10590,6 +10605,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
                     : GGML_TYPE_Q3_K;
        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
+                    (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
+            new_type = GGML_TYPE_Q4_K;
+        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
        }
@@ -10621,37 +10640,41 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
        if (qs.model.hparams.n_expert == 8) {
            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
-                ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+                ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
+                ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
                new_type = GGML_TYPE_Q5_K;
            }
        } else {
-            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+            if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   ) new_type = GGML_TYPE_Q3_K;
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_Q4_K;
        }
    } else {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
    }
    }
    else if (name.find("attn_qkv.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            new_type = GGML_TYPE_Q4_K;
+        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
    }
    else if (name.find("ffn_gate") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
        int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
-            new_type = GGML_TYPE_Q2_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+            new_type = GGML_TYPE_IQ3_XXS;
        }
        ++qs.i_ffn_gate;
    }
    else if (name.find("ffn_up") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
        int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
-            new_type = GGML_TYPE_Q2_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+            new_type = GGML_TYPE_IQ3_XXS;
        }
        ++qs.i_ffn_up;
    }
@@ -10671,7 +10694,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
-        new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
+        new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
        int nx = tensor->ne[0];
        int ny = tensor->ne[1];
        if (nx % QK_K != 0) {
@@ -10686,6 +10709,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            case GGML_TYPE_IQ2_XXS:
            case GGML_TYPE_IQ2_XS:
            case GGML_TYPE_IQ3_XXS:
+            case GGML_TYPE_IQ3_S:
            case GGML_TYPE_IQ1_S:
            case GGML_TYPE_Q2_K:
            case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break;
@@ -10717,7 +10741,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        // K-quants
        case LLAMA_FTYPE_MOSTLY_Q2_K_S:
        case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
-        case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
+        case LLAMA_FTYPE_MOSTLY_Q3_K_XS: quantized_type = GGML_TYPE_IQ3_S; break;
        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
        case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -10731,6 +10755,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
        case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_S: quantized_type = GGML_TYPE_IQ3_S; break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_M: quantized_type = GGML_TYPE_IQ3_S; break;
 
        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }
@@ -11325,7 +11351,7 @@ static int llama_apply_lora_from_file_internal(
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
         /*.n_gpu_layers =*/ 0,
-        /*.split_mode =*/ LLAMA_SPLIT_LAYER,
+        /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
         /*.progress_callback =*/ nullptr,
@@ -11351,7 +11377,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_batch =*/ 512,
         /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
         /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
-        /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
+        /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
         /*.rope_freq_base =*/ 0.0f,
         /*.rope_freq_scale =*/ 0.0f,
         /*.yarn_ext_factor =*/ -1.0f,
@@ -11539,16 +11565,16 @@ struct llama_context * llama_new_context_with_model(
     cparams.cb_eval_user_data = params.cb_eval_user_data;
 
     auto rope_scaling_type = params.rope_scaling_type;
-    if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
+    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
         rope_scaling_type = hparams.rope_scaling_type_train;
     }
 
-    if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) {
+    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
         cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
     }
 
     if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
-        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
+        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
     }
 
     if (params.seed == LLAMA_DEFAULT_SEED) {
@@ -11582,8 +11608,8 @@ struct llama_context * llama_new_context_with_model(
     }
 #elif defined(GGML_USE_CUBLAS)
     if (model->n_gpu_layers > 0) {
-        // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
-        if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) {
+        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
            ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
            if (backend == nullptr) {
                LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
@@ -11592,7 +11618,7 @@ struct llama_context * llama_new_context_with_model(
            }
            ctx->backends.push_back(backend);
        } else {
-            // LLAMA_SPLIT_LAYER requires a backend for each GPU
+            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
            for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
                ggml_backend_t backend = ggml_backend_cuda_init(device);
                if (backend == nullptr) {
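For callers of the talk-llama example, the visible effect of this sync is the switch to the suffixed enum names (LLAMA_SPLIT_MODE_*, LLAMA_ROPE_SCALING_TYPE_*, LLAMA_POOLING_TYPE_*, LLAMA_KV_OVERRIDE_TYPE_*). Below is a minimal sketch (not part of this commit) of a caller picking up the renamed constants through the public params structs; the model path is a placeholder.

#include "llama.h"

int main(void) {
    struct llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;
    mparams.split_mode   = LLAMA_SPLIT_MODE_LAYER;            // was LLAMA_SPLIT_LAYER
    mparams.main_gpu     = 0;

    struct llama_context_params cparams = llama_context_default_params();
    cparams.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; // was LLAMA_ROPE_SCALING_YARN

    // "model.gguf" is a placeholder path
    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        return 1;
    }

    struct llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == NULL) {
        llama_free_model(model);
        return 1;
    }

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}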
examples/talk-llama/llama.h CHANGED
@@ -102,28 +102,30 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ3_S = 26, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
 
     enum llama_rope_scaling_type {
-        LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
-        LLAMA_ROPE_SCALING_NONE = 0,
-        LLAMA_ROPE_SCALING_LINEAR = 1,
-        LLAMA_ROPE_SCALING_YARN = 2,
-        LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
+        LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
+        LLAMA_ROPE_SCALING_TYPE_NONE = 0,
+        LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
+        LLAMA_ROPE_SCALING_TYPE_YARN = 2,
+        LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
     };
 
     enum llama_pooling_type {
-        LLAMA_POOLING_NONE = 0,
-        LLAMA_POOLING_MEAN = 1,
-        LLAMA_POOLING_CLS = 2,
+        LLAMA_POOLING_TYPE_NONE = 0,
+        LLAMA_POOLING_TYPE_MEAN = 1,
+        LLAMA_POOLING_TYPE_CLS = 2,
     };
 
     enum llama_split_mode {
-        LLAMA_SPLIT_NONE = 0, // single GPU
-        LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
-        LLAMA_SPLIT_ROW = 2, // split rows across GPUs
+        LLAMA_SPLIT_MODE_NONE = 0, // single GPU
+        LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
+        LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
     };
 
     typedef struct llama_token_data {
@@ -171,9 +173,9 @@ extern "C" {
     } llama_batch;
 
     enum llama_model_kv_override_type {
-        LLAMA_KV_OVERRIDE_INT,
-        LLAMA_KV_OVERRIDE_FLOAT,
-        LLAMA_KV_OVERRIDE_BOOL,
+        LLAMA_KV_OVERRIDE_TYPE_INT,
+        LLAMA_KV_OVERRIDE_TYPE_FLOAT,
+        LLAMA_KV_OVERRIDE_TYPE_BOOL,
     };
 
     struct llama_model_kv_override {
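The llama_model_kv_override tags follow the same rename pattern. A minimal sketch of building an override list with the new names, assuming the kv_overrides pointer field that llama_model_params carries at this revision and the convention that an empty key terminates the list; the key and value here are illustrative and the snippet is not part of this commit.

#include <stdio.h>
#include <string.h>
#include "llama.h"

struct llama_model_params make_params_with_override(void) {
    // two entries: one real override, one zeroed terminator (empty key ends the list - assumption)
    static struct llama_model_kv_override overrides[2];
    memset(overrides, 0, sizeof(overrides));

    // illustrative key: cap the number of experts used per token
    snprintf(overrides[0].key, sizeof(overrides[0].key), "llama.expert_used_count");
    overrides[0].tag       = LLAMA_KV_OVERRIDE_TYPE_INT; // was LLAMA_KV_OVERRIDE_INT
    overrides[0].int_value = 2;

    struct llama_model_params mparams = llama_model_default_params();
    mparams.kv_overrides = overrides; // assumed field, present in llama_model_params at this revision
    return mparams;
}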