talk-llama : sync llama.cpp
Files changed:
- examples/talk-llama/llama.cpp +71 -22
- examples/talk-llama/llama.h +14 -0
examples/talk-llama/llama.cpp
CHANGED
@@ -1903,6 +1903,28 @@ static void llama_kv_cache_seq_shift(
     cache.head = new_head != cache.size ? new_head : 0;
 }
 
+static void llama_kv_cache_seq_div(
+        struct llama_kv_cache & cache,
+        llama_seq_id seq_id,
+        llama_pos p0,
+        llama_pos p1,
+        int d) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.has_shift = true;
+
+            {
+                llama_pos p_old = cache.cells[i].pos;
+                cache.cells[i].pos   /= d;
+                cache.cells[i].delta += cache.cells[i].pos - p_old;
+            }
+        }
+    }
+}
+
 //
 // model loading and saving
 //
@@ -2180,7 +2202,11 @@ struct llama_model_loader {
                 type_max = type;
             }
 
-            //
+            // TODO: make runtime configurable
+#if 0
+            struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+            LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
+#endif
         }
 
         switch (type_max) {
@@ -2196,6 +2222,8 @@ struct llama_model_loader {
             case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
             case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
             case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
+            case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
+            case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2558,7 +2586,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
 
         // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
@@ -2567,6 +2596,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
 
         default: return "unknown, may not work";
     }
@@ -2801,6 +2832,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
                 switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
@@ -3117,7 +3149,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
-
+    if (ml.n_elements >= 1e12) {
+        LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
+    } else if (ml.n_elements >= 1e9) {
+        LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
+    } else if (ml.n_elements >= 1e6) {
+        LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, ml.n_elements*1e-6);
+    } else {
+        LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, ml.n_elements*1e-3);
+    }
     if (ml.n_bytes < GiB) {
         LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
@@ -4772,7 +4812,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4896,7 +4935,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -4995,9 +5033,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         const int64_t n_rot = n_embd_head_k / 2;
 
@@ -5209,9 +5245,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5304,7 +5338,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5400,7 +5433,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5727,7 +5759,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * attn_norm_output;
@@ -5951,7 +5982,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -8926,10 +8956,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
        }
-    } else if (name.find("ffn_down
+    } else if (name.find("ffn_down") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
+        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_feed_forward_w2 <
+            new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
                     : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                     : GGML_TYPE_Q3_K;
        }
@@ -8938,14 +8971,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
            if (arch == LLM_ARCH_FALCON) {
-                new_type = qs.i_feed_forward_w2 <
+                new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
            } else {
                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
            }
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 <
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
            new_type = GGML_TYPE_Q5_K;
        }
        ++qs.i_feed_forward_w2;
@@ -8963,9 +8996,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
    }
-
-
-
+    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
+    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    //}
    // This can be used to reduce the size of the Q5_K_S model.
    // The associated PPL increase is fully in line with the size reduction
    //else {
@@ -9014,6 +9048,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
        // K-quants
        case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
        case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -9022,6 +9057,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
        case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
        case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
 
        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }
@@ -9070,7 +9107,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
            ++qs.n_attention_wv;
        }
-        else if (name.find("ffn_down
+        else if (name.find("ffn_down") != std::string::npos) {
            ++qs.n_feed_forward_w2;
        }
    }
@@ -10146,9 +10183,21 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
 }
 
 void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+    if (delta == 0) {
+        return;
+    }
+
     llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
 }
 
+void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+    if (d == 1) {
+        return;
+    }
+
+    llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
+}
+
 // Returns the *maximum* size of the state
 size_t llama_get_state_size(const struct llama_context * ctx) {
     // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
@@ -10881,7 +10930,7 @@ void llama_print_timings(struct llama_context * ctx) {
            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
     LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
+    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
examples/talk-llama/llama.h
CHANGED
@@ -103,6 +103,9 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -484,6 +487,17 @@ extern "C" {
             llama_pos p1,
             llama_pos delta);
 
+    // Integer division of the positions by factor of `d > 1`
+    // If the KV cache is RoPEd, the KV data is updated accordingly
+    // p0 < 0 : [0, p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_kv_cache_seq_div(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            int d);
+
     //
     // State / sessions
     //