ggerganov committed
Commit d128cb3 (unverified) · 1 Parent(s): 6a472b5

talk-llama : llama.cpp

examples/talk-llama/llama.cpp CHANGED
@@ -987,6 +987,7 @@ struct llama_mmap {
         }
 
         if (prefetch > 0) {
+#if _WIN32_WINNT >= 0x602
             // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
             BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
             HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
@@ -1004,6 +1005,9 @@ struct llama_mmap {
                             llama_format_win_err(GetLastError()).c_str());
                 }
             }
+#else
+            throw std::runtime_error("PrefetchVirtualMemory unavailable");
+#endif
         }
     }
 
@@ -1110,7 +1114,7 @@ struct llama_mlock {
             suggest = false;
         }
 
-        fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+        LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
                 size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
         return false;
     }
@@ -1119,7 +1123,7 @@ struct llama_mlock {
 
     static void raw_unlock(void * addr, size_t size) {
         if (munlock(addr, size)) {
-            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
+            LLAMA_LOG_WARN("warning: failed to munlock buffer: %s\n", std::strerror(errno));
         }
     }
 #elif defined(_WIN32)
@@ -1137,7 +1141,7 @@ struct llama_mlock {
             return true;
         }
         if (tries == 2) {
-            fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
+            LLAMA_LOG_WARN("warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
                 len, size, llama_format_win_err(GetLastError()).c_str());
             return false;
         }
@@ -1146,7 +1150,7 @@ struct llama_mlock {
             // set size and try again.
             SIZE_T min_ws_size, max_ws_size;
             if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
-                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
+                LLAMA_LOG_WARN("warning: GetProcessWorkingSetSize failed: %s\n",
                         llama_format_win_err(GetLastError()).c_str());
                 return false;
             }
@@ -1159,7 +1163,7 @@ struct llama_mlock {
             min_ws_size += increment;
             max_ws_size += increment;
             if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
-                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
+                LLAMA_LOG_WARN("warning: SetProcessWorkingSetSize failed: %s\n",
                         llama_format_win_err(GetLastError()).c_str());
                 return false;
             }
@@ -1168,7 +1172,7 @@ struct llama_mlock {
 
     static void raw_unlock(void * ptr, size_t len) {
         if (!VirtualUnlock(ptr, len)) {
-            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
+            LLAMA_LOG_WARN("warning: failed to VirtualUnlock buffer: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
         }
     }
@@ -1180,7 +1184,7 @@ struct llama_mlock {
     }
 
     bool raw_lock(const void * addr, size_t len) const {
-        fprintf(stderr, "warning: mlock not supported on this system\n");
+        LLAMA_LOG_WARN("warning: mlock not supported on this system\n");
         return false;
     }
 
@@ -2081,13 +2085,13 @@ namespace GGUFMeta {
                     __func__, override_type_to_str(override->tag), override->key);
             switch (override->tag) {
                 case LLAMA_KV_OVERRIDE_BOOL: {
-                    printf("%s\n", override->bool_value ? "true" : "false");
+                    LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false");
                 } break;
                 case LLAMA_KV_OVERRIDE_INT: {
-                    printf("%" PRId64 "\n", override->int_value);
+                    LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value);
                 } break;
                 case LLAMA_KV_OVERRIDE_FLOAT: {
-                    printf("%.6f\n", override->float_value);
+                    LLAMA_LOG_INFO("%.6f\n", override->float_value);
                 } break;
                 default:
                     // Shouldn't be possible to end up here, but just in case...
@@ -6989,7 +6993,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                 if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
 
 #ifdef PRETOKENIZERDEBUG
-                fprintf(stderr, "FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+                LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
 #endif
                 auto source = std::distance(buffer.begin(), it);
 
@@ -7002,7 +7006,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                     buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
 
 #ifdef PRETOKENIZERDEBUG
-                    fprintf(stderr, "FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
+                    LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
 #endif
                     it++;
                 }
@@ -7018,7 +7022,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                     buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
 
 #ifdef PRETOKENIZERDEBUG
-                    fprintf(stderr, "FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
+                    LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
 #endif
 
                     it++;
@@ -7034,7 +7038,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                         raw_text_base_length = right_reminder_length;
 
 #ifdef PRETOKENIZERDEBUG
-                        fprintf(stderr, "RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+                        LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
 #endif
                     } else {
                         if (source == 0) {
@@ -7091,7 +7095,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 }
 
 #ifdef PRETOKENIZERDEBUG
-                fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
                 llm_tokenizer_spm tokenizer(vocab);
                 llama_escape_whitespace(raw_text);
@@ -7112,7 +7116,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
-                fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
                 llm_tokenizer_bpe tokenizer(vocab);
                 tokenizer.tokenize(raw_text, output);
@@ -8429,9 +8433,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+            new_type = GGML_TYPE_Q5_K;
+        }
         else if (new_type != GGML_TYPE_Q8_0) {
             new_type = GGML_TYPE_Q6_K;
         }
+    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+        if (name.find("attn_v.weight") != std::string::npos) {
+            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
+            else new_type = GGML_TYPE_Q2_K;
+            ++qs.i_attention_wv;
+        }
+        else if (name.find("ffn_down") != std::string::npos) {
+            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K;
+            ++qs.i_feed_forward_w2;
+        }
+        else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -8462,13 +8480,31 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             new_type = GGML_TYPE_Q8_0;
         }
     } else if (name.find("ffn_down") != std::string::npos) {
+        const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
+        int i_layer, n_layer;
+        if (n_expert == 1) {
+            i_layer = qs.i_feed_forward_w2;
+            n_layer = qs.n_feed_forward_w2;
+        } else {
+            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
+            // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
+            // for getting the current layer as I initially thought, and we need to resort to parsing the
+            // tensor name.
+            n_layer = qs.n_feed_forward_w2 / n_expert;
+            if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
+                throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
+            }
+            if (i_layer < 0 || i_layer >= n_layer) {
+                throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer));
+            }
+        }
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
-            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
+            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
-                     : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
+            new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
+                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
@@ -8476,22 +8512,29 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
             if (arch == LLM_ARCH_FALCON) {
-                new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
-                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
+                           use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
            new_type = GGML_TYPE_Q5_K;
        }
        ++qs.i_feed_forward_w2;
    } else if (name.find("attn_output.weight") != std::string::npos) {
        if (arch != LLM_ARCH_FALCON) {
-            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+            if (qs.model.hparams.n_expert == 8) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+                    new_type = GGML_TYPE_Q5_K;
+                }
+            } else {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+            }
        } else {
            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
        }
@@ -8594,6 +8637,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     if (params->only_copy) {
         ftype = model.ftype;
     }
+    const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
+    if (params->imatrix) {
+        imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
+        if (imatrix_data) {
+            LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
+        }
+    }
 
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
@@ -8651,6 +8701,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     ::zeros(fout, meta_size);
 
+    std::set<ggml_type> used_iq2;
+
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * tensor = ml.get_tensor_meta(i);
 
@@ -8703,6 +8755,35 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             const size_t nelements = ggml_nelements(tensor);
 
+            if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS) && used_iq2.find(new_type) == used_iq2.end()) {
+                ggml_init_iq2_quantization(new_type);
+                used_iq2.insert(new_type);
+            }
+
+            const float * imatrix = nullptr;
+            if (imatrix_data) {
+                auto it = imatrix_data->find(tensor->name);
+                if (it == imatrix_data->end()) {
+                    LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
+                } else {
+                    if (it->second.size() == (size_t)tensor->ne[0]) {
+                        imatrix = it->second.data();
+                    } else {
+                        LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
+                                int(it->second.size()), int(tensor->ne[0]), tensor->name);
+                    }
+                }
+            }
+            if ((new_type == GGML_TYPE_IQ2_XXS ||
+                 new_type == GGML_TYPE_IQ2_XS ||
+                (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
+                LLAMA_LOG_ERROR("\n\n============================================================\n");
+                LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
+                LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
+                LLAMA_LOG_ERROR("============================================================\n\n");
+                throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
+            }
+
             float * f32_data;
 
             if (tensor->type == GGML_TYPE_F32) {
@@ -8723,21 +8804,28 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = work.data();
             std::array<int64_t, 1 << 4> hist_cur = {};
 
-            static const int chunk_size = 32 * 512;
+            const int n_per_row = tensor->ne[0];
+            const int nrows = nelements / n_per_row;
+
+            static const int min_chunk_size = 32 * 512;
+            const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
+
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
             const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
             if (nthread_use < 2) {
-                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
             } else {
-                size_t counter = 0;
+                int counter = 0;
                 new_size = 0;
-                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
+                        nrows, n_per_row, imatrix]() {
                     std::array<int64_t, 1 << 4> local_hist = {};
+                    const int nrows_per_chunk = chunk_size / n_per_row;
                     size_t local_size = 0;
                     while (true) {
                         std::unique_lock<std::mutex> lock(mutex);
-                        size_t first = counter; counter += chunk_size;
-                        if (first >= nelements) {
+                        int first_row = counter; counter += nrows_per_chunk;
+                        if (first_row >= nrows) {
                             if (local_size > 0) {
                                 for (int j=0; j<int(local_hist.size()); ++j) {
                                     hist_cur[j] += local_hist[j];
@@ -8747,8 +8835,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                             break;
                         }
                         lock.unlock();
-                        size_t last = std::min(nelements, first + chunk_size);
-                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                        const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
+                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
+                                first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
                     }
                 };
                 for (int it = 0; it < nthread_use - 1; ++it) {
@@ -8759,7 +8848,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 workers.clear();
             }
 
-            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
+            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
             int64_t tot_count = 0;
             for (size_t i = 0; i < hist_cur.size(); i++) {
                 hist_all[i] += hist_cur[i];
@@ -8767,6 +8856,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
 
             if (tot_count > 0) {
+                LLAMA_LOG_INFO(" | hist: ");
                 for (size_t i = 0; i < hist_cur.size(); i++) {
                     LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
                 }
@@ -8795,6 +8885,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     fout.close();
 
+    for (auto type : used_iq2) {
+        ggml_deinit_iq2_quantization(type);
+    }
+
     gguf_free(ctx_out);
 
     LLAMA_LOG_INFO("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
@@ -9159,6 +9253,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.quantize_output_tensor =*/ true,
         /*.only_copy              =*/ false,
        /*.pure                    =*/ false,
+       /*.imatrix                 =*/ nullptr,
    };

    return result;
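Editorial note on the quantization changes above: work is now handed to threads in whole rows rather than raw element counts, because the importance matrix supplies one weight per column and must be applied to complete rows. Below is a minimal standalone sketch of that chunk planning, reusing the `n_per_row`/`nrows`/`min_chunk_size` names from the diff; the `plan_chunks` helper and the `main` driver are illustrative only and are not part of llama.cpp.

#include <algorithm>
#include <cstdio>
#include <vector>

// Illustrative only: mirrors the row-aligned chunk planning used in the quantization
// loop above, without calling into ggml. Each chunk covers whole rows, never a partial row.
struct chunk { int first_row; int n_rows; };

static std::vector<chunk> plan_chunks(int nelements, int n_per_row) {
    static const int min_chunk_size = 32 * 512;                        // same constant as in the diff
    const int nrows      = nelements / n_per_row;
    const int chunk_size = n_per_row >= min_chunk_size
        ? n_per_row
        : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);    // round up to a whole number of rows
    const int nrows_per_chunk = chunk_size / n_per_row;

    std::vector<chunk> chunks;
    for (int first_row = 0; first_row < nrows; first_row += nrows_per_chunk) {
        chunks.push_back({ first_row, std::min(nrows - first_row, nrows_per_chunk) });
    }
    return chunks;
}

int main() {
    // e.g. a hypothetical tensor with 32 rows of 4096 elements each
    for (const chunk & c : plan_chunks(4096 * 32, 4096)) {
        std::printf("rows [%d, %d)\n", c.first_row, c.first_row + c.n_rows);
    }
    return 0;
}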
examples/talk-llama/llama.h CHANGED
@@ -249,6 +249,7 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
+        void * imatrix;              // pointer to importance matrix data
     } llama_model_quantize_params;
 
     // grammar types
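For downstream callers, the new `imatrix` field is a type-erased pointer to the map consumed in `llama_model_quantize_internal` (tensor name mapped to one importance weight per column). A hedged caller-side sketch follows; the file paths and the hand-filled map contents are placeholders, and in practice the map would come from an importance-matrix collection run rather than being constructed by hand.

#include <string>
#include <unordered_map>
#include <vector>

#include "llama.h"

int main() {
    // Placeholder importance data: tensor name -> one weight per column (tensor->ne[0]).
    // 11008 is just an example ffn_down width, not taken from any particular model.
    std::unordered_map<std::string, std::vector<float>> imatrix_data;
    imatrix_data["blk.0.ffn_down.weight"] = std::vector<float>(11008, 1.0f);

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_Q2_K_S; // one of the low-bit types that checks for imatrix data
    params.imatrix = &imatrix_data;             // consumed via the cast shown in llama_model_quantize_internal

    // Placeholder paths; backend setup and error handling elided.
    return llama_model_quantize("model-f16.gguf", "model-q2_k_s.gguf", &params) == 0 ? 0 : 1;
}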