talk-llama : llama.cpp
- examples/talk-llama/llama.cpp  +131 -36
- examples/talk-llama/llama.h    +1 -0
examples/talk-llama/llama.cpp
CHANGED
@@ -987,6 +987,7 @@ struct llama_mmap {
         }
 
         if (prefetch > 0) {
+#if _WIN32_WINNT >= 0x602
             // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
             BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
             HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
@@ -1004,6 +1005,9 @@ struct llama_mmap {
                         llama_format_win_err(GetLastError()).c_str());
                 }
             }
+#else
+            throw std::runtime_error("PrefetchVirtualMemory unavailable");
+#endif
         }
     }
 
@@ -1110,7 +1114,7 @@ struct llama_mlock {
             suggest = false;
         }
 
-
+        LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
                 size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
         return false;
     }
@@ -1119,7 +1123,7 @@ struct llama_mlock {
 
     static void raw_unlock(void * addr, size_t size) {
         if (munlock(addr, size)) {
-
+            LLAMA_LOG_WARN("warning: failed to munlock buffer: %s\n", std::strerror(errno));
         }
     }
 #elif defined(_WIN32)
@@ -1137,7 +1141,7 @@ struct llama_mlock {
                 return true;
             }
             if (tries == 2) {
-
+                LLAMA_LOG_WARN("warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
                     len, size, llama_format_win_err(GetLastError()).c_str());
                 return false;
             }
@@ -1146,7 +1150,7 @@ struct llama_mlock {
             // set size and try again.
             SIZE_T min_ws_size, max_ws_size;
             if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
-
+                LLAMA_LOG_WARN("warning: GetProcessWorkingSetSize failed: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
                 return false;
             }
@@ -1159,7 +1163,7 @@ struct llama_mlock {
             min_ws_size += increment;
             max_ws_size += increment;
             if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
-
+                LLAMA_LOG_WARN("warning: SetProcessWorkingSetSize failed: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
                 return false;
             }
@@ -1168,7 +1172,7 @@ struct llama_mlock {
 
     static void raw_unlock(void * ptr, size_t len) {
         if (!VirtualUnlock(ptr, len)) {
-
+            LLAMA_LOG_WARN("warning: failed to VirtualUnlock buffer: %s\n",
                 llama_format_win_err(GetLastError()).c_str());
         }
     }
@@ -1180,7 +1184,7 @@ struct llama_mlock {
     }
 
     bool raw_lock(const void * addr, size_t len) const {
-
+        LLAMA_LOG_WARN("warning: mlock not supported on this system\n");
        return false;
    }
 
@@ -2081,13 +2085,13 @@ namespace GGUFMeta {
                 __func__, override_type_to_str(override->tag), override->key);
             switch (override->tag) {
                 case LLAMA_KV_OVERRIDE_BOOL: {
-
+                    LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false");
                 } break;
                 case LLAMA_KV_OVERRIDE_INT: {
-
+                    LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value);
                 } break;
                 case LLAMA_KV_OVERRIDE_FLOAT: {
-
+                    LLAMA_LOG_INFO("%.6f\n", override->float_value);
                 } break;
                 default:
                     // Shouldn't be possible to end up here, but just in case...
@@ -6989,7 +6993,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                 if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
 
 #ifdef PRETOKENIZERDEBUG
-
+                LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
 #endif
                 auto source = std::distance(buffer.begin(), it);
 
@@ -7002,7 +7006,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                     buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
 
 #ifdef PRETOKENIZERDEBUG
-
+                    LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
 #endif
                     it++;
                 }
@@ -7018,7 +7022,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                     buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
 
 #ifdef PRETOKENIZERDEBUG
-
+                    LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
 #endif
 
                     it++;
@@ -7034,7 +7038,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                         raw_text_base_length = right_reminder_length;
 
 #ifdef PRETOKENIZERDEBUG
-
+                        LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
 #endif
                     } else {
                         if (source == 0) {
@@ -7091,7 +7095,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 }
 
 #ifdef PRETOKENIZERDEBUG
-
+                LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
                 llm_tokenizer_spm tokenizer(vocab);
                 llama_escape_whitespace(raw_text);
@@ -7112,7 +7116,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
-
+                LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
                 llm_tokenizer_bpe tokenizer(vocab);
                 tokenizer.tokenize(raw_text, output);
@@ -8429,9 +8433,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+            new_type = GGML_TYPE_Q5_K;
+        }
         else if (new_type != GGML_TYPE_Q8_0) {
             new_type = GGML_TYPE_Q6_K;
         }
+    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+        if (name.find("attn_v.weight") != std::string::npos) {
+            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
+            else new_type = GGML_TYPE_Q2_K;
+            ++qs.i_attention_wv;
+        }
+        else if (name.find("ffn_down") != std::string::npos) {
+            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K;
+            ++qs.i_feed_forward_w2;
+        }
+        else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -8462,13 +8480,31 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             new_type = GGML_TYPE_Q8_0;
         }
     } else if (name.find("ffn_down") != std::string::npos) {
+        const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
+        int i_layer, n_layer;
+        if (n_expert == 1) {
+            i_layer = qs.i_feed_forward_w2;
+            n_layer = qs.n_feed_forward_w2;
+        } else {
+            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
+            // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
+            // for getting the current layer as I initially thought, and we need to resort to parsing the
+            // tensor name.
+            n_layer = qs.n_feed_forward_w2 / n_expert;
+            if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
+                throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
+            }
+            if (i_layer < 0 || i_layer >= n_layer) {
+                throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer));
+            }
+        }
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
-            if (
+            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type =
-                     : arch != LLM_ARCH_FALCON || use_more_bits(
+            new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
+                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
@@ -8476,22 +8512,29 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
             if (arch == LLM_ARCH_FALCON) {
-                new_type =
-                    use_more_bits(
+                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
+                           use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (use_more_bits(
+                if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON &&
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
             new_type = GGML_TYPE_Q5_K;
         }
         ++qs.i_feed_forward_w2;
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
-            if
-
-
+            if (qs.model.hparams.n_expert == 8) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+                    new_type = GGML_TYPE_Q5_K;
+                }
+            } else {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+            }
         } else {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
         }
@@ -8594,6 +8637,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     if (params->only_copy) {
         ftype = model.ftype;
     }
+    const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
+    if (params->imatrix) {
+        imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
+        if (imatrix_data) {
+            LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
+        }
+    }
 
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
@@ -8651,6 +8701,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     ::zeros(fout, meta_size);
 
+    std::set<ggml_type> used_iq2;
+
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * tensor = ml.get_tensor_meta(i);
 
@@ -8703,6 +8755,35 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             const size_t nelements = ggml_nelements(tensor);
 
+            if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS) && used_iq2.find(new_type) == used_iq2.end()) {
+                ggml_init_iq2_quantization(new_type);
+                used_iq2.insert(new_type);
+            }
+
+            const float * imatrix = nullptr;
+            if (imatrix_data) {
+                auto it = imatrix_data->find(tensor->name);
+                if (it == imatrix_data->end()) {
+                    LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
+                } else {
+                    if (it->second.size() == (size_t)tensor->ne[0]) {
+                        imatrix = it->second.data();
+                    } else {
+                        LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
+                                int(it->second.size()), int(tensor->ne[0]), tensor->name);
+                    }
+                }
+            }
+            if ((new_type == GGML_TYPE_IQ2_XXS ||
+                 new_type == GGML_TYPE_IQ2_XS ||
+                (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
+                LLAMA_LOG_ERROR("\n\n============================================================\n");
+                LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
+                LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
+                LLAMA_LOG_ERROR("============================================================\n\n");
+                throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
+            }
+
             float * f32_data;
 
             if (tensor->type == GGML_TYPE_F32) {
@@ -8723,21 +8804,28 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = work.data();
             std::array<int64_t, 1 << 4> hist_cur = {};
 
-
+            const int n_per_row = tensor->ne[0];
+            const int nrows = nelements / n_per_row;
+
+            static const int min_chunk_size = 32 * 512;
+            const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
+
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
             const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
             if (nthread_use < 2) {
-                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0,
+                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
             } else {
-
+                int counter = 0;
                 new_size = 0;
-                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data,
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
+                                nrows, n_per_row, imatrix]() {
                     std::array<int64_t, 1 << 4> local_hist = {};
+                    const int nrows_per_chunk = chunk_size / n_per_row;
                     size_t local_size = 0;
                     while (true) {
                         std::unique_lock<std::mutex> lock(mutex);
-
-                        if (
+                        int first_row = counter; counter += nrows_per_chunk;
+                        if (first_row >= nrows) {
                             if (local_size > 0) {
                                 for (int j=0; j<int(local_hist.size()); ++j) {
                                     hist_cur[j] += local_hist[j];
@@ -8747,8 +8835,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                             break;
                         }
                         lock.unlock();
-
-                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
+                        const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
+                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
+                                first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
                     }
                 };
                 for (int it = 0; it < nthread_use - 1; ++it) {
@@ -8759,7 +8848,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 workers.clear();
             }
 
-            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB
+            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
             int64_t tot_count = 0;
             for (size_t i = 0; i < hist_cur.size(); i++) {
                 hist_all[i] += hist_cur[i];
@@ -8767,6 +8856,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
 
             if (tot_count > 0) {
+                LLAMA_LOG_INFO(" | hist: ");
                 for (size_t i = 0; i < hist_cur.size(); i++) {
                     LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
                 }
@@ -8795,6 +8885,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     fout.close();
 
+    for (auto type : used_iq2) {
+        ggml_deinit_iq2_quantization(type);
+    }
+
     gguf_free(ctx_out);
 
     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
@@ -9159,6 +9253,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.quantize_output_tensor =*/ true,
         /*.only_copy =*/ false,
        /*.pure =*/ false,
+        /*.imatrix =*/ nullptr,
    };
 
    return result;
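For reference, the rewritten multi-threaded quantization loop above hands out work in whole rows rather than raw element ranges: a chunk is one row when a row already spans at least 32 * 512 elements, otherwise enough rows to reach that minimum. A minimal standalone sketch of the same arithmetic follows; the tensor shape is made up purely for illustration and is not taken from the commit.

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    // Illustrative tensor shape only: 4096 columns x 11008 rows.
    const int64_t n_per_row = 4096;
    const int64_t nrows     = 11008;
    const int64_t nelements = n_per_row * nrows;

    // Same expressions as in the diff: a chunk is a whole number of rows and
    // spans at least min_chunk_size elements.
    const int64_t min_chunk_size = 32 * 512;
    const int64_t chunk_size = n_per_row >= min_chunk_size
        ? n_per_row
        : n_per_row * ((min_chunk_size + n_per_row - 1) / n_per_row);

    const int64_t nchunk          = (nelements + chunk_size - 1) / chunk_size;
    const int64_t nrows_per_chunk = chunk_size / n_per_row;
    const int     nthread         = 8;
    const int64_t nthread_use     = nthread > 1 ? std::max<int64_t>(1, std::min<int64_t>(nthread, nchunk)) : 1;

    // For 4096 x 11008 this prints: chunk = 16384 elements (4 rows), 2752 chunks, 8 threads used.
    std::printf("chunk = %lld elements (%lld rows), %lld chunks, %lld threads used\n",
                (long long) chunk_size, (long long) nrows_per_chunk, (long long) nchunk, (long long) nthread_use);
    return 0;
}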
examples/talk-llama/llama.h
CHANGED
@@ -249,6 +249,7 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         bool pure; // disable k-quant mixtures and quantize all tensors to the same type
+        void * imatrix; // pointer to importance matrix data
     } llama_model_quantize_params;
 
     // grammar types
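On the API side, the new imatrix field is an opaque pointer: inside llama_model_quantize_internal it is cast back to a std::unordered_map<std::string, std::vector<float>> keyed by tensor name, holding one weight per column of the tensor (tensor->ne[0]). A hypothetical caller-side sketch, assuming the existing llama_model_quantize_default_params() and llama_model_quantize() entry points from llama.h; the tensor name and sizes below are placeholders.

#include "llama.h"

#include <string>
#include <unordered_map>
#include <vector>

int main() {
    // Importance matrix in the layout the quantizer expects: tensor name -> one float per column.
    // In practice this comes from statistics collected on calibration data; this entry is a placeholder.
    std::unordered_map<std::string, std::vector<float>> imatrix_data;
    imatrix_data["blk.0.ffn_down.weight"] = std::vector<float>(11008, 1.0f);

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_Q2_K_S; // very low-bit types now refuse to run without an imatrix
    params.imatrix = &imatrix_data;             // opaque pointer, cast back inside llama.cpp

    // Returns 0 on success.
    return (int) llama_model_quantize("model-f16.gguf", "model-q2_k_s.gguf", &params);
}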