talk-llama : sync llama.cpp
- examples/talk-llama/llama-arch.cpp +127 -0
- examples/talk-llama/llama-arch.h +13 -0
- examples/talk-llama/llama-batch.cpp +2 -2
- examples/talk-llama/llama-chat.cpp +34 -4
- examples/talk-llama/llama-chat.h +2 -0
- examples/talk-llama/llama-context.cpp +48 -27
- examples/talk-llama/llama-context.h +10 -6
- examples/talk-llama/llama-graph.cpp +175 -54
- examples/talk-llama/llama-graph.h +69 -20
- examples/talk-llama/llama-hparams.cpp +9 -3
- examples/talk-llama/llama-hparams.h +15 -6
- examples/talk-llama/llama-kv-cache-unified-iswa.cpp +12 -6
- examples/talk-llama/llama-kv-cache-unified-iswa.h +2 -2
- examples/talk-llama/llama-kv-cache-unified.cpp +95 -71
- examples/talk-llama/llama-kv-cache-unified.h +3 -3
- examples/talk-llama/llama-memory-hybrid.cpp +8 -3
- examples/talk-llama/llama-memory-hybrid.h +3 -2
- examples/talk-llama/llama-memory-recurrent.cpp +6 -2
- examples/talk-llama/llama-memory-recurrent.h +2 -2
- examples/talk-llama/llama-memory.h +2 -2
- examples/talk-llama/llama-model-loader.cpp +1 -0
- examples/talk-llama/llama-model-loader.h +3 -2
- examples/talk-llama/llama-model.cpp +1114 -53
- examples/talk-llama/llama-model.h +25 -4
- examples/talk-llama/llama-quant.cpp +40 -4
- examples/talk-llama/llama-vocab.cpp +50 -1
- examples/talk-llama/llama-vocab.h +1 -0
- examples/talk-llama/llama.h +34 -4
examples/talk-llama/llama-arch.cpp
CHANGED
@@ -62,6 +62,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DEEPSEEK2,        "deepseek2"     },
     { LLM_ARCH_CHATGLM,          "chatglm"       },
     { LLM_ARCH_GLM4,             "glm4"          },
+    { LLM_ARCH_GLM4_MOE,         "glm4moe"       },
     { LLM_ARCH_BITNET,           "bitnet"        },
     { LLM_ARCH_T5,               "t5"            },
     { LLM_ARCH_T5ENCODER,        "t5encoder"     },
@@ -85,9 +86,13 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_ERNIE4_5,         "ernie4_5"      },
     { LLM_ARCH_ERNIE4_5_MOE,     "ernie4_5-moe"  },
     { LLM_ARCH_HUNYUAN_MOE,      "hunyuan-moe"   },
+    { LLM_ARCH_HUNYUAN_DENSE,    "hunyuan-dense" },
     { LLM_ARCH_SMOLLM3,          "smollm3"       },
+    { LLM_ARCH_OPENAI_MOE,       "gpt-oss"       },
     { LLM_ARCH_LFM2,             "lfm2"          },
     { LLM_ARCH_DREAM,            "dream"         },
+    { LLM_ARCH_SMALLTHINKER,     "smallthinker"  },
+    { LLM_ARCH_LLADA,            "llada"         },
     { LLM_ARCH_UNKNOWN,          "(unknown)"     },
 };
 
@@ -124,6 +129,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_NORM,    "%s.expert_weights_norm"    },
     { LLM_KV_EXPERT_GATING_FUNC,     "%s.expert_gating_func"     },
     { LLM_KV_MOE_EVERY_N_LAYERS,     "%s.moe_every_n_layers"     },
+    { LLM_KV_NEXTN_PREDICT_LAYERS,   "%s.nextn_predict_layers"   },
     { LLM_KV_POOLING_TYPE,           "%s.pooling_type"           },
     { LLM_KV_LOGIT_SCALE,            "%s.logit_scale"            },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -1388,6 +1394,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
         },
     },
+    {
+        LLM_ARCH_GLM4_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,     "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,    "output_norm" },
+            { LLM_TENSOR_OUTPUT,         "output" },
+            { LLM_TENSOR_ATTN_NORM,      "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q,         "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,         "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,         "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,       "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_Q_NORM,    "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,    "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_FFN_GATE,       "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,       "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,         "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP,   "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,  "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,  "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,    "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,   "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+            // NextN/MTP tensors - preserved but unused (in final layer, dynamic layer number)
+            { LLM_TENSOR_NEXTN_EH_PROJ,          "blk.%d.nextn.eh_proj" },
+            { LLM_TENSOR_NEXTN_EMBED_TOKENS,     "blk.%d.nextn.embed_tokens" },
+            { LLM_TENSOR_NEXTN_ENORM,            "blk.%d.nextn.enorm" },
+            { LLM_TENSOR_NEXTN_HNORM,            "blk.%d.nextn.hnorm" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
+        },
+    },
     {
         LLM_ARCH_BITNET,
         {
@@ -1895,6 +1935,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS,  "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_HUNYUAN_DENSE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+
+        },
+    },
     {
         LLM_ARCH_SMOLLM3,
         {
@@ -1912,6 +1972,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP,   "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_OPENAI_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,     "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,    "output_norm" },
+            { LLM_TENSOR_OUTPUT,         "output" },
+            { LLM_TENSOR_ATTN_NORM,      "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q,         "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,         "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,         "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,       "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_SINKS,     "blk.%d.attn_sinks" },
+            { LLM_TENSOR_FFN_GATE_INP,   "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,  "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,  "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,    "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_LFM2,
         {
@@ -1933,6 +2012,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
         }
     },
+    {
+        LLM_ARCH_SMALLTHINKER,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,    "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,   "output_norm" },
+            { LLM_TENSOR_OUTPUT,        "output" },
+            { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,      "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP,  "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,   "blk.%d.ffn_up_exps" }
+        },
+    },
     {
         LLM_ARCH_DREAM,
         {
@@ -1950,6 +2050,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP,   "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_LLADA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1989,6 +2106,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_KV_B,  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_K_B,   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_V_B,   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_SINKS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
     {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -2120,6 +2238,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SHORTCONV_CONV,    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
     {LLM_TENSOR_SHORTCONV_INPROJ,  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    // NextN/MTP tensors are currently ignored (reserved for future MTP support)
+    // These tensors only exist in the last layer(s) and are treated as output tensors
+    {LLM_TENSOR_NEXTN_EH_PROJ,          {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_EMBED_TOKENS,     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_ENORM,            {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_HNORM,            {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
 };
 
 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
@@ -2202,6 +2328,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
 bool llm_arch_is_diffusion(const llm_arch & arch) {
     switch (arch) {
         case LLM_ARCH_DREAM:
+        case LLM_ARCH_LLADA:
             return true;
         default:
             return false;
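The per-layer tensor names registered above are printf-style patterns with a %d placeholder for the layer index. A minimal sketch of how such a pattern expands; the helper name here is hypothetical, not part of llama.cpp:

#include <cstdio>
#include <string>

// expand a per-layer tensor name pattern for a given layer index
static std::string format_tensor_name(const char * pattern, int il) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), pattern, il);
    return buf;
}

int main() {
    // for the new GLM4_MOE entry, layer 3's expert gate tensor is named:
    std::printf("%s\n", format_tensor_name("blk.%d.ffn_gate_exps", 3).c_str());
    // -> blk.3.ffn_gate_exps
}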
examples/talk-llama/llama-arch.h
CHANGED
@@ -66,6 +66,7 @@ enum llm_arch {
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
     LLM_ARCH_GLM4,
+    LLM_ARCH_GLM4_MOE,
     LLM_ARCH_BITNET,
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
@@ -89,9 +90,13 @@ enum llm_arch {
     LLM_ARCH_ERNIE4_5,
     LLM_ARCH_ERNIE4_5_MOE,
     LLM_ARCH_HUNYUAN_MOE,
+    LLM_ARCH_HUNYUAN_DENSE,
     LLM_ARCH_SMOLLM3,
+    LLM_ARCH_OPENAI_MOE,
     LLM_ARCH_LFM2,
     LLM_ARCH_DREAM,
+    LLM_ARCH_SMALLTHINKER,
+    LLM_ARCH_LLADA,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -128,6 +133,7 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
     LLM_KV_MOE_EVERY_N_LAYERS,
+    LLM_KV_NEXTN_PREDICT_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@@ -260,6 +266,7 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_OUT_NORM,
     LLM_TENSOR_ATTN_POST_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
+    LLM_TENSOR_ATTN_SINKS,
     LLM_TENSOR_FFN_GATE_INP,
     LLM_TENSOR_FFN_GATE_INP_SHEXP,
     LLM_TENSOR_FFN_NORM,
@@ -406,6 +413,12 @@ enum llm_tensor {
     LLM_TENSOR_SHORTCONV_CONV,
     LLM_TENSOR_SHORTCONV_INPROJ,
     LLM_TENSOR_SHORTCONV_OUTPROJ,
+    LLM_TENSOR_NEXTN_EH_PROJ,
+    LLM_TENSOR_NEXTN_EMBED_TOKENS,
+    LLM_TENSOR_NEXTN_ENORM,
+    LLM_TENSOR_NEXTN_HNORM,
+    LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
+    LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
 };
 
 enum llm_tensor_layer {
examples/talk-llama/llama-batch.cpp
CHANGED
@@ -59,7 +59,7 @@ bool llama_batch_allocr::init(
     for (int32_t i = 0; i < batch.n_tokens; ++i) {
         for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
             if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= (llama_seq_id) n_seq_max)) {
-                LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d …
+                LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d >= %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
                 return false;
             }
         }
@@ -477,7 +477,7 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
 
 llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch, bool sequential) {
     if (sequential && has_cpl) {
-        LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch\n", __func__);
+        LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch (you may need to use the -kvu flag)\n", __func__);
 
         return {};
     }
examples/talk-llama/llama-chat.cpp
CHANGED
@@ -66,6 +66,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "llama4",        LLM_CHAT_TEMPLATE_LLAMA4        },
     { "smolvlm",       LLM_CHAT_TEMPLATE_SMOLVLM       },
     { "hunyuan-moe",   LLM_CHAT_TEMPLATE_HUNYUAN_MOE   },
+    { "gpt-oss",       LLM_CHAT_TEMPLATE_OPENAI_MOE    },
+    { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
     { "kimi-k2",       LLM_CHAT_TEMPLATE_KIMI_K2       },
 };
 
@@ -191,8 +193,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_LLAMA4;
     } else if (tmpl_contains("<|endofuserprompt|>")) {
         return LLM_CHAT_TEMPLATE_DOTS1;
-    } else if (tmpl_contains("<|…
+    } else if (tmpl_contains("<|extra_0|>") && tmpl_contains("<|extra_4|>")) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
+    } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
+        return LLM_CHAT_TEMPLATE_OPENAI_MOE;
+    } else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
+        return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
     } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
         return LLM_CHAT_TEMPLATE_KIMI_K2;
     }
@@ -619,8 +625,6 @@ int32_t llm_chat_apply_template(
     } else if (tmpl == LLM_CHAT_TEMPLATE_YANDEX) {
         // Yandex template ("\n\n" is defined as EOT token)
 
-        ss << "<s>";
-
         for (size_t i = 0; i < chat.size(); i++) {
             std::string role(chat[i]->role);
             if (role == "user") {
@@ -698,11 +702,37 @@ int32_t llm_chat_apply_template(
             if (role == "system") {
                 ss << "<|startoftext|>" << message->content << "<|extra_4|>";
             } else if (role == "assistant") {
-                ss << …
+                ss << message->content << "<|eos|>";
             } else {
                 ss << "<|startoftext|>" << message->content << "<|extra_0|>";
             }
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_OPENAI_MOE) {
+        // OpenAI MoE (based on Harmony chat template)
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|start|>" << role << "<|message|>" << message->content;
+            ss << (role == "assistant" ? "<|return|>" : "<|end|>");
+        }
+        if (add_ass) {
+            ss << "<|start|>assistant";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_DENSE) {
+        // tencent/Hunyuan-4B-Instruct
+        for (size_t i = 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (i == 0) {
+                if (role == "system") {
+                    ss << chat[i]->content << "<|hy_place▁holder▁no▁3|>";
+                }
+            }
+
+            if (role == "assistant") {
+                ss << "<|hy_Assistant|>" << chat[i]->content << "<|hy_place▁holder▁no▁2|>";
+            } else if (role == "user") {
+                ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
+            }
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
         // moonshotai/Kimi-K2-Instruct
         for (auto message : chat) {
examples/talk-llama/llama-chat.h
CHANGED
@@ -46,6 +46,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_DOTS1,
     LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
+    LLM_CHAT_TEMPLATE_OPENAI_MOE,
+    LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
     LLM_CHAT_TEMPLATE_KIMI_K2,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
examples/talk-llama/llama-context.cpp
CHANGED
@@ -105,7 +105,7 @@ llama_context::llama_context(
 
     {
         const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : …
+        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : supports_set_rows;
 
         if (!supports_set_rows && !cparams.kv_unified) {
             LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
@@ -113,6 +113,15 @@ llama_context::llama_context(
         }
     }
 
+    {
+        const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
+        graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
+
+        if (graph_reuse_disable) {
+            LLAMA_LOG_WARN("%s: graph reuse disabled\n", __func__);
+        }
+    }
+
     const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
 
     LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
@@ -716,7 +725,7 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
     // in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters
     const auto gparams = graph_params(res, ubatch, mctx, gtype);
 
-    if (res->can_reuse(gparams)) {
+    if (!graph_reuse_disable && res->can_reuse(gparams)) {
         //LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
 
         n_reused++;
@@ -777,7 +786,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
     const auto & hparams = model.hparams;
 
     const int64_t n_embd  = hparams.n_embd;
-    const …
+    const int64_t n_vocab = model.vocab.n_tokens();
 
     // note: during encode, we always pass the full sequence starting from pos = 0
     if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
@@ -950,7 +959,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     const auto & vocab   = model.vocab;
     const auto & hparams = model.hparams;
 
-    const …
+    const int64_t n_vocab = vocab.n_tokens();
     const int64_t n_embd  = hparams.n_embd;
 
     // when computing embeddings, all tokens are output
@@ -1319,21 +1328,21 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 }
 
 void llama_context::output_reorder() {
-    const …
+    const uint64_t n_vocab = model.vocab.n_tokens();
     const uint64_t n_embd  = model.hparams.n_embd;
 
-    for (…
-    const …
-    const …
+    for (size_t s = 0; s < output_swaps.size(); ++s) {
+        const uint64_t i0 = output_swaps[s].i0;
+        const uint64_t i1 = output_swaps[s].i1;
 
         if (logits_size > 0) {
-            for (…
+            for (uint64_t k = 0; k < n_vocab; k++) {
                 std::swap(logits[i0*n_vocab + k], logits[i1*n_vocab + k]);
             }
         }
 
         if (embd_size > 0) {
-            for (…
+            for (uint64_t k = 0; k < n_embd; k++) {
                 std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
             }
         }
@@ -1648,30 +1657,30 @@ size_t llama_context::state_set_data(const uint8_t * src, size_t size) {
     }
 }
 
-size_t llama_context::state_seq_get_size(llama_seq_id seq_id) {
+size_t llama_context::state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags) {
     llama_io_write_dummy io;
     try {
-        return state_seq_write_data(io, seq_id);
+        return state_seq_write_data(io, seq_id, flags);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
         return 0;
     }
 }
 
-size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) {
+size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size, llama_state_seq_flags flags) {
     llama_io_write_buffer io(dst, size);
     try {
-        return state_seq_write_data(io, seq_id);
+        return state_seq_write_data(io, seq_id, flags);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
         return 0;
     }
 }
 
-size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) {
+size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags) {
     llama_io_read_buffer io(src, size);
     try {
-        return state_seq_read_data(io, seq_id);
+        return state_seq_read_data(io, seq_id, flags);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
         return 0;
@@ -1769,7 +1778,7 @@ size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * file
     {
         const size_t state_size = file.size() - file.tell();
         llama_io_read_file io(&file);
-        const size_t nread = state_seq_read_data(io, seq_id);
+        const size_t nread = state_seq_read_data(io, seq_id, 0);
         if (!nread) {
             LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
             return 0;
@@ -1793,7 +1802,7 @@ size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * file
 
     // save the context state using stream saving
     llama_io_write_file io(&file);
-    state_seq_write_data(io, seq_id);
+    state_seq_write_data(io, seq_id, 0);
 
     const size_t res = file.tell();
     GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + io.n_bytes());
@@ -1962,21 +1971,21 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
     return io.n_bytes();
 }
 
-size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) {
+size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
     GGML_UNUSED(seq_id);
 
     if (memory) {
-        memory->state_write(io, seq_id);
+        memory->state_write(io, seq_id, flags);
     }
 
     return io.n_bytes();
 }
 
-size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) {
+size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
     GGML_UNUSED(seq_id);
 
     if (memory) {
-        memory->state_read(io, seq_id);
+        memory->state_read(io, seq_id, flags);
     }
 
     return io.n_bytes();
@@ -2039,7 +2048,7 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params
     opt_params.opt_period      = n_batch / n_ubatch;
     opt_params.get_opt_pars    = lopt_params.get_opt_pars;
     opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud;
-
+    opt_params.optimizer       = lopt_params.optimizer_type;
     opt_ctx = ggml_opt_init(opt_params);
 
     llama_opt_param_filter param_filter = lopt_params.param_filter;
@@ -2792,19 +2801,31 @@ bool llama_state_save_file(llama_context * ctx, const char * path_session, const
 }
 
 size_t llama_state_seq_get_size(llama_context * ctx, llama_seq_id seq_id) {
-    return ctx…
+    return llama_state_seq_get_size_ext(ctx, seq_id, 0);
 }
 
 size_t llama_state_seq_get_data(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) {
+    return llama_state_seq_get_data_ext(ctx, dst, size, seq_id, 0);
+}
+
+size_t llama_state_seq_set_data(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) {
+    return llama_state_seq_set_data_ext(ctx, src, size, seq_id, 0);
+}
+
+size_t llama_state_seq_get_size_ext(llama_context * ctx, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    return ctx->state_seq_get_size(seq_id, flags);
+}
+
+size_t llama_state_seq_get_data_ext(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
     ctx->synchronize();
 
-    return ctx->state_seq_get_data(seq_id, dst, size);
+    return ctx->state_seq_get_data(seq_id, dst, size, flags);
 }
 
-size_t …
+size_t llama_state_seq_set_data_ext(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
     ctx->synchronize();
 
-    return ctx->state_seq_set_data(seq_id, src, size);
+    return ctx->state_seq_set_data(seq_id, src, size, flags);
 }
 
 size_t llama_state_seq_save_file(llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
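A hedged usage sketch of the new _ext state entry points introduced above. Passing flags = 0 reproduces the old behavior; the original functions now simply forward to these. Error handling is kept minimal.

#include <cstdint>
#include <vector>
#include "llama.h"

// serialize one sequence's state into a byte buffer
std::vector<uint8_t> save_seq_state(llama_context * ctx, llama_seq_id seq_id) {
    const size_t n = llama_state_seq_get_size_ext(ctx, seq_id, /*flags =*/ 0);
    std::vector<uint8_t> buf(n);
    const size_t written = llama_state_seq_get_data_ext(ctx, buf.data(), buf.size(), seq_id, 0);
    buf.resize(written); // written == 0 signals failure
    return buf;
}

// restore a previously saved sequence state
void restore_seq_state(llama_context * ctx, llama_seq_id seq_id, const std::vector<uint8_t> & buf) {
    llama_state_seq_set_data_ext(ctx, buf.data(), buf.size(), seq_id, 0);
}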
examples/talk-llama/llama-context.h
CHANGED
@@ -111,9 +111,9 @@ struct llama_context {
     size_t state_get_data(      uint8_t * dst, size_t size);
     size_t state_set_data(const uint8_t * src, size_t size);
 
-    size_t state_seq_get_size(llama_seq_id seq_id);
-    size_t state_seq_get_data(llama_seq_id seq_id,       uint8_t * dst, size_t size);
-    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size);
+    size_t state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags);
+    size_t state_seq_get_data(llama_seq_id seq_id,       uint8_t * dst, size_t size, llama_state_seq_flags flags);
+    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags);
 
     bool state_load_file(
             const char * filepath,
@@ -152,6 +152,7 @@ struct llama_context {
 
     void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
 
+    // TODO: more flexible combinations of logical/physical batch size and context size
     void opt_epoch(
             ggml_opt_dataset_t  dataset,
             ggml_opt_result_t   result_train,
@@ -212,8 +213,8 @@ private:
     size_t state_write_data(llama_io_write_i & io);
     size_t state_read_data (llama_io_read_i  & io);
 
-    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id);
-    size_t state_seq_read_data (llama_io_read_i  & io, llama_seq_id seq_id);
+    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);
+    size_t state_seq_read_data (llama_io_read_i  & io, llama_seq_id seq_id, llama_state_seq_flags flags);
 
     //
     // members
@@ -289,7 +290,10 @@ private:
 
     // env: LLAMA_SET_ROWS (temporary)
     // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows = …
+    bool supports_set_rows = true;
+
+    // env: LLAMA_GRAPH_REUSE_DISABLE
+    bool graph_reuse_disable = false;
 
     // perf
     mutable int64_t t_start_us = 0;
examples/talk-llama/llama-graph.cpp
CHANGED
|
@@ -188,38 +188,23 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
|
|
| 188 |
|
| 189 |
void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
|
| 190 |
const int64_t n_tokens = ubatch->n_tokens;
|
| 191 |
-
const int64_t n_seq_tokens = ubatch->n_seq_tokens;
|
| 192 |
const int64_t n_seqs_unq = ubatch->n_seqs_unq;
|
| 193 |
|
| 194 |
if (cparams.embeddings && (
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
|
|
|
| 198 |
GGML_ASSERT(cls);
|
| 199 |
GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
|
| 200 |
|
| 201 |
uint32_t * data = (uint32_t *) cls->data;
|
| 202 |
memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls));
|
| 203 |
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
const llama_seq_id seq_id = ubatch->seq_id[i][s];
|
| 207 |
-
const int32_t seq_idx = ubatch->seq_idx[seq_id];
|
| 208 |
-
|
| 209 |
-
data[seq_idx] = i;
|
| 210 |
-
}
|
| 211 |
-
}
|
| 212 |
-
}
|
| 213 |
-
|
| 214 |
-
if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
|
| 215 |
-
GGML_ASSERT(cls);
|
| 216 |
-
GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
|
| 217 |
-
|
| 218 |
-
uint32_t * data = (uint32_t *) cls->data;
|
| 219 |
-
memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls));
|
| 220 |
|
| 221 |
-
|
| 222 |
-
std::vector<int> last_row(n_seqs_unq, -1);
|
| 223 |
|
| 224 |
for (int i = 0; i < n_tokens; ++i) {
|
| 225 |
const llama_pos pos = ubatch->pos[i];
|
|
@@ -228,16 +213,20 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
|
|
| 228 |
const llama_seq_id seq_id = ubatch->seq_id[i][s];
|
| 229 |
const int32_t seq_idx = ubatch->seq_idx[seq_id];
|
| 230 |
|
| 231 |
-
if (
|
| 232 |
-
|
| 233 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
}
|
| 235 |
}
|
| 236 |
}
|
| 237 |
|
| 238 |
for (int s = 0; s < n_seqs_unq; ++s) {
|
| 239 |
-
if (
|
| 240 |
-
data[s] =
|
| 241 |
}
|
| 242 |
}
|
| 243 |
}
|
|
@@ -751,6 +740,8 @@ ggml_tensor * llm_graph_context::build_ffn(
|
|
| 751 |
cur = ggml_reglu(ctx0, cur);
|
| 752 |
cb(cur, "ffn_reglu", il);
|
| 753 |
} break;
|
|
|
|
|
|
|
| 754 |
}
|
| 755 |
|
| 756 |
if (gate && type_gate == LLM_FFN_PAR) {
|
|
@@ -760,8 +751,8 @@ ggml_tensor * llm_graph_context::build_ffn(
|
|
| 760 |
|
| 761 |
if (down) {
|
| 762 |
cur = build_lora_mm(down, cur);
|
| 763 |
-
if (arch == LLM_ARCH_GLM4) {
|
| 764 |
-
// GLM4
|
| 765 |
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
| 766 |
}
|
| 767 |
}
|
|
@@ -796,13 +787,64 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|
| 796 |
bool scale_w,
|
| 797 |
float w_scale,
|
| 798 |
llama_expert_gating_func_type gating_op,
|
| 799 |
-
int il
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 800 |
const int64_t n_embd = cur->ne[0];
|
| 801 |
const int64_t n_tokens = cur->ne[1];
|
| 802 |
const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
|
| 803 |
|
| 804 |
-
ggml_tensor * logits =
|
| 805 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 806 |
|
| 807 |
ggml_tensor * probs = nullptr;
|
| 808 |
switch (gating_op) {
|
|
@@ -814,6 +856,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|
| 814 |
{
|
| 815 |
probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
|
| 816 |
} break;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 817 |
default:
|
| 818 |
GGML_ABORT("fatal error");
|
| 819 |
}
|
|
@@ -842,6 +888,13 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|
| 842 |
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
|
| 843 |
cb(weights, "ffn_moe_weights", il);
|
| 844 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 845 |
if (norm_w) {
|
| 846 |
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
|
| 847 |
|
|
@@ -870,6 +923,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|
| 870 |
ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
| 871 |
cb(up, "ffn_moe_up", il);
|
| 872 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 873 |
ggml_tensor * experts = nullptr;
|
| 874 |
if (gate_exps) {
|
| 875 |
cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
|
@@ -878,6 +936,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|
| 878 |
cur = up;
|
| 879 |
}
|
| 880 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 881 |
switch (type_op) {
|
| 882 |
case LLM_FFN_SILU:
|
| 883 |
if (gate_exps) {
|
|
@@ -895,6 +958,22 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|
| 895 |
cur = ggml_gelu(ctx0, cur);
|
| 896 |
cb(cur, "ffn_moe_gelu", il);
|
| 897 |
} break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 898 |
default:
|
| 899 |
GGML_ABORT("fatal error");
|
| 900 |
}
|
|
@@ -902,6 +981,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|
| 902 |
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
|
| 903 |
cb(experts, "ffn_moe_down", il);
|
| 904 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 905 |
if (!weight_before_ffn) {
|
| 906 |
experts = ggml_mul(ctx0, experts, weights);
|
| 907 |
cb(cur, "ffn_moe_weighted", il);
|
|
@@ -1140,6 +1224,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
|
| 1140 |
ggml_tensor * kq_b,
|
| 1141 |
ggml_tensor * kq_mask,
|
| 1142 |
ggml_tensor * v_mla,
|
|
|
|
| 1143 |
float kq_scale) const {
|
| 1144 |
const bool v_trans = v->nb[1] > v->nb[2];
|
| 1145 |
|
|
@@ -1176,7 +1261,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
|
| 1176 |
cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
|
| 1177 |
hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
|
| 1178 |
|
| 1179 |
-
|
|
|
|
| 1180 |
|
| 1181 |
if (v_mla) {
|
| 1182 |
#if 0
|
|
@@ -1224,6 +1310,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
|
| 1224 |
}
|
| 1225 |
|
| 1226 |
kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
|
|
|
|
| 1227 |
|
| 1228 |
if (!v_trans) {
|
| 1229 |
// note: avoid this branch
|
|
@@ -1294,7 +1381,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
|
| 1294 |
ggml_tensor * k = k_cur;
|
| 1295 |
ggml_tensor * v = v_cur;
|
| 1296 |
|
| 1297 |
-
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
|
| 1298 |
cb(cur, "kqv_out", il);
|
| 1299 |
|
| 1300 |
if (wo) {
|
|
@@ -1382,13 +1469,13 @@ ggml_tensor * llm_graph_context::build_attn(
|
|
| 1382 |
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
|
| 1383 |
ggml_tensor * v = mctx_cur->get_v(ctx0, il);
|
| 1384 |
|
| 1385 |
-
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
|
| 1386 |
cb(cur, "kqv_out", il);
|
| 1387 |
|
| 1388 |
if (wo) {
|
| 1389 |
cur = build_lora_mm(wo, cur);
|
| 1390 |
-
if (arch == LLM_ARCH_GLM4) {
|
| 1391 |
-
// GLM4
|
| 1392 |
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
| 1393 |
}
|
| 1394 |
}
|
|
@@ -1411,6 +1498,32 @@ ggml_tensor * llm_graph_context::build_attn(
|
|
| 1411 |
ggml_tensor * v_mla,
|
| 1412 |
float kq_scale,
|
| 1413 |
int il) const {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1414 |
// these nodes are added to the graph together so that they are not reordered
|
| 1415 |
// by doing so, the number of splits in the graph is reduced
|
| 1416 |
ggml_build_forward_expand(gf, q_cur);
|
|
@@ -1448,7 +1561,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
|
| 1448 |
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
|
| 1449 |
ggml_tensor * v = mctx_cur->get_v(ctx0, il);
|
| 1450 |
|
| 1451 |
-
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
|
| 1452 |
cb(cur, "kqv_out", il);
|
| 1453 |
|
| 1454 |
if (wo) {
|
|
@@ -1502,7 +1615,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
|
| 1502 |
ggml_tensor * k = k_cur;
|
| 1503 |
ggml_tensor * v = v_cur;
|
| 1504 |
|
| 1505 |
-
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
|
| 1506 |
cb(cur, "kqv_out", il);
|
| 1507 |
|
| 1508 |
if (wo) {
|
|
@@ -1561,16 +1674,17 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
|
|
| 1561 |
|
| 1562 |
ggml_tensor * llm_graph_context::build_rs(
|
| 1563 |
ggml_tensor * s,
|
| 1564 |
-
ggml_tensor *
|
|
|
|
| 1565 |
int32_t state_size,
|
| 1566 |
int32_t n_seqs,
|
| 1567 |
-
uint32_t
|
| 1568 |
-
uint32_t
|
| 1569 |
-
uint32_t
|
| 1570 |
int32_t rs_zero,
|
| 1571 |
const llm_graph_get_rows_fn & get_state_rows) const {
|
| 1572 |
|
| 1573 |
-
ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size,
|
| 1574 |
|
| 1575 |
// Clear a single state which will then be copied to the other cleared states.
|
| 1576 |
// Note that this is a no-op when the view is zero-sized.
|
|
@@ -1578,39 +1692,44 @@ ggml_tensor * llm_graph_context::build_rs(
|
|
| 1578 |
ggml_build_forward_expand(gf, ggml_scale_inplace(ctx0, state_zero, 0));
|
| 1579 |
|
| 1580 |
// copy states
|
| 1581 |
-
// NOTE: assuming the copy destinations are ALL contained between
|
| 1582 |
-
// {state_size,
|
| 1583 |
-
ggml_tensor * output_states = get_state_rows(ctx0, states,
|
| 1584 |
ggml_build_forward_expand(gf, output_states);
|
| 1585 |
|
| 1586 |
-
// copy extra states which won't be changed further (between n_seqs and
|
| 1587 |
-
ggml_tensor * states_extra = ggml_get_rows(ctx0, states,
|
| 1588 |
ggml_build_forward_expand(gf,
|
| 1589 |
ggml_cpy(ctx0,
|
| 1590 |
states_extra,
|
| 1591 |
-
ggml_view_1d(ctx0, s, state_size*(
|
| 1592 |
|
| 1593 |
return output_states;
|
| 1594 |
}
|
| 1595 |
|
| 1596 |
static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
|
| 1597 |
ggml_context * ctx0,
|
|
|
|
| 1598 |
const llama_memory_recurrent_context * mctx_cur) {
|
| 1599 |
|
| 1600 |
auto inp = std::make_unique<llm_graph_input_rs>(mctx_cur);
|
| 1601 |
|
| 1602 |
-
const
|
|
|
|
| 1603 |
|
| 1604 |
inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_rs);
|
| 1605 |
ggml_set_input(inp->s_copy);
|
| 1606 |
|
|
|
|
|
|
|
|
|
|
| 1607 |
return inp;
|
| 1608 |
}
|
| 1609 |
|
| 1610 |
llm_graph_input_rs * llm_graph_context::build_rs_inp() const {
|
| 1611 |
const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
|
| 1612 |
|
| 1613 |
-
auto inp = build_rs_inp_impl(ctx0, mctx_cur);
|
| 1614 |
|
| 1615 |
return (llm_graph_input_rs *) res->add_input(std::move(inp));
|
| 1616 |
}
|
|
@@ -1623,7 +1742,9 @@ ggml_tensor * llm_graph_context::build_rs(
|
|
| 1623 |
const llm_graph_get_rows_fn & get_state_rows) const {
|
| 1624 |
const auto * kv_state = inp->mctx;
|
| 1625 |
|
| 1626 |
-
return build_rs(s, inp->
|
|
|
|
|
|
|
| 1627 |
}
|
| 1628 |
|
| 1629 |
ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
|
|
@@ -1670,7 +1791,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
|
|
| 1670 |
llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
|
| 1671 |
const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
|
| 1672 |
|
| 1673 |
-
auto inp_rs = build_rs_inp_impl(ctx0, mctx_cur->get_recr());
|
| 1674 |
auto inp_attn = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
|
| 1675 |
|
| 1676 |
auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
|
|
|
|
| 188 |
|
| 189 |
void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
|
| 190 |
const int64_t n_tokens = ubatch->n_tokens;
|
|
|
|
| 191 |
const int64_t n_seqs_unq = ubatch->n_seqs_unq;
|
| 192 |
|
| 193 |
if (cparams.embeddings && (
|
| 194 |
+
cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
|
| 195 |
+
cparams.pooling_type == LLAMA_POOLING_TYPE_RANK ||
|
| 196 |
+
cparams.pooling_type == LLAMA_POOLING_TYPE_LAST
|
| 197 |
+
)) {
|
| 198 |
GGML_ASSERT(cls);
|
| 199 |
GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
|
| 200 |
|
| 201 |
uint32_t * data = (uint32_t *) cls->data;
|
| 202 |
memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls));
|
| 203 |
|
| 204 |
+
std::vector<int> target_pos(n_seqs_unq, -1);
|
| 205 |
+
std::vector<int> target_row(n_seqs_unq, -1);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
+
bool last = cparams.pooling_type == LLAMA_POOLING_TYPE_LAST;
|
|
|
|
| 208 |
|
| 209 |
for (int i = 0; i < n_tokens; ++i) {
|
| 210 |
const llama_pos pos = ubatch->pos[i];
|
|
|
|
| 213 |
const llama_seq_id seq_id = ubatch->seq_id[i][s];
|
| 214 |
const int32_t seq_idx = ubatch->seq_idx[seq_id];
|
| 215 |
|
| 216 |
+
if (
|
| 217 |
+
(target_pos[seq_idx] == -1) ||
|
| 218 |
+
( last && pos >= target_pos[seq_idx]) ||
|
| 219 |
+
(!last && pos < target_pos[seq_idx])
|
| 220 |
+
) {
|
| 221 |
+
target_pos[seq_idx] = pos;
|
| 222 |
+
target_row[seq_idx] = i;
|
| 223 |
}
|
| 224 |
}
|
| 225 |
}
|
| 226 |
|
| 227 |
for (int s = 0; s < n_seqs_unq; ++s) {
|
| 228 |
+
if (target_row[s] >= 0) {
|
| 229 |
+
data[s] = target_row[s];
|
| 230 |
}
|
| 231 |
}
|
| 232 |
}
|
|
|
|
| 740 |
cur = ggml_reglu(ctx0, cur);
|
| 741 |
cb(cur, "ffn_reglu", il);
|
| 742 |
} break;
|
| 743 |
+
default:
|
| 744 |
+
GGML_ABORT("fatal error");
|
| 745 |
}
|
| 746 |
|
| 747 |
if (gate && type_gate == LLM_FFN_PAR) {
|
|
|
|
| 751 |
|
| 752 |
if (down) {
|
| 753 |
cur = build_lora_mm(down, cur);
|
| 754 |
+
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
|
| 755 |
+
// GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
|
| 756 |
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
| 757 |
}
|
| 758 |
}
|
|
|
|
@@ ... @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         bool     scale_w,
         float    w_scale,
         llama_expert_gating_func_type gating_op,
-        int   il) const {
+        int   il,
+        ggml_tensor * probs_in) const {
+    return build_moe_ffn(
+            cur,
+            gate_inp,  /* gate_inp_b  */ nullptr,
+            up_exps,   /* up_exps_b   */ nullptr,
+            gate_exps, /* gate_exps_b */ nullptr,
+            down_exps, /* down_exps_b */ nullptr,
+            exp_probs_b,
+            n_expert,
+            n_expert_used,
+            type_op,
+            norm_w,
+            scale_w,
+            w_scale,
+            gating_op,
+            il,
+            probs_in
+        );
+}
+
+ggml_tensor * llm_graph_context::build_moe_ffn(
+        ggml_tensor * cur,
+        ggml_tensor * gate_inp,
+        ggml_tensor * gate_inp_b,
+        ggml_tensor * up_exps,
+        ggml_tensor * up_exps_b,
+        ggml_tensor * gate_exps,
+        ggml_tensor * gate_exps_b,
+        ggml_tensor * down_exps,
+        ggml_tensor * down_exps_b,
+        ggml_tensor * exp_probs_b,
+        int64_t   n_expert,
+        int64_t   n_expert_used,
+        llm_ffn_op_type type_op,
+        bool      norm_w,
+        bool      scale_w,
+        float     w_scale,
+        llama_expert_gating_func_type gating_op,
+        int   il,
+        ggml_tensor * probs_in) const {
     const int64_t n_embd   = cur->ne[0];
     const int64_t n_tokens = cur->ne[1];
     const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
 
+    ggml_tensor * logits = nullptr;
+
+    if (probs_in == nullptr) {
+        logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens]
+        cb(logits, "ffn_moe_logits", il);
+    } else {
+        logits = probs_in;
+    }
+
+    if (gate_inp_b) {
+        logits = ggml_add(ctx0, logits, gate_inp_b);
+        cb(logits, "ffn_moe_logits_biased", il);
+    }
 
     ggml_tensor * probs = nullptr;
     switch (gating_op) {
@@ ... @@
             {
                 probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
             } break;
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT:
+            {
+                probs = logits; // [n_expert, n_tokens]
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ ... @@
             ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
     cb(weights, "ffn_moe_weights", il);
 
+    if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) {
+        weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
+        weights = ggml_soft_max(ctx0, weights); // [n_expert_used, n_tokens]
+        weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
+        cb(weights, "ffn_moe_weights_softmax", il);
+    }
+
     if (norm_w) {
         weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
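The SOFTMAX_WEIGHT path above changes the order of operations: regular SOFTMAX gating normalizes all n_expert logits before the top-k selection, while SOFTMAX_WEIGHT keeps the raw logits through the top-k step and applies the softmax only to the n_expert_used selected entries. A plain-C++ sketch of the latter scheme on toy numbers (no ggml, just the arithmetic):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<float> logits = {2.0f, 0.5f, 1.0f, 3.0f}; // n_expert = 4
        const int n_expert_used = 2;

        // top-k expert indices by raw logit (the ggml_top_k step)
        std::vector<int> idx = {0, 1, 2, 3};
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                          [&](int a, int b) { return logits[a] > logits[b]; });
        idx.resize(n_expert_used);

        // softmax over the selected logits only (the ggml_soft_max on the gathered weights)
        float sum = 0.0f;
        std::vector<float> w(n_expert_used);
        for (int i = 0; i < n_expert_used; ++i) {
            w[i] = std::exp(logits[idx[i]]);
            sum += w[i];
        }
        for (int i = 0; i < n_expert_used; ++i) {
            printf("expert %d weight %.3f\n", idx[i], w[i] / sum); // experts 3 and 0
        }
    }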
@@ ... @@
     ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(up, "ffn_moe_up", il);
 
+    if (up_exps_b) {
+        up = ggml_add_id(ctx0, up, up_exps_b, selected_experts);
+        cb(up, "ffn_moe_up_biased", il);
+    }
+
     ggml_tensor * experts = nullptr;
     if (gate_exps) {
         cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
@@ ... @@
         cur = up;
     }
 
+    if (gate_exps_b) {
+        cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts);
+        cb(cur, "ffn_moe_gate_biased", il);
+    }
+
     switch (type_op) {
         case LLM_FFN_SILU:
             if (gate_exps) {
@@ ... @@
                 cur = ggml_gelu(ctx0, cur);
                 cb(cur, "ffn_moe_gelu", il);
             } break;
+        case LLM_FFN_SWIGLU_OAI_MOE:
+            {
+                // TODO: move to hparams?
+                constexpr float alpha = 1.702f;
+                constexpr float limit = 7.0f;
+                cur = ggml_swiglu_oai(ctx0, cur, up, alpha, limit);
+                cb(cur, "ffn_moe_swiglu_oai", il);
+            } break;
+        case LLM_FFN_RELU:
+            if (gate_exps) {
+                cur = ggml_reglu_split(ctx0, cur, up);
+                cb(cur, "ffn_moe_reglu", il);
+            } else {
+                cur = ggml_relu(ctx0, cur);
+                cb(cur, "ffn_moe_relu", il);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ ... @@
     experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
 
+    if (down_exps_b) {
+        experts = ggml_add_id(ctx0, experts, down_exps_b, selected_experts);
+        cb(experts, "ffn_moe_down_biased", il);
+    }
+
     if (!weight_before_ffn) {
         experts = ggml_mul(ctx0, experts, weights);
         cb(cur, "ffn_moe_weighted", il);
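The bias hunks above all go through ggml_add_id, which, as used here, adds to each selected expert's activation slice the bias row picked out by selected_experts. A scalar model of that indexed add for a single token (invented shapes and values, not the ggml implementation):

    #include <cstdio>

    int main() {
        const int n_ff = 3, n_expert_used = 2, n_expert = 4;

        float up[n_expert_used][n_ff] = {{1, 2, 3}, {4, 5, 6}}; // activations per used expert
        float bias[n_expert][n_ff]    = {{0, 0, 0}, {9, 9, 9}, {1, 1, 1}, {2, 2, 2}};
        int   selected[n_expert_used] = {3, 1};                 // expert ids chosen by the router

        // up[e] += bias[selected[e]] -- the effect of ggml_add_id for one token
        for (int e = 0; e < n_expert_used; ++e) {
            for (int j = 0; j < n_ff; ++j) {
                up[e][j] += bias[selected[e]][j];
            }
        }

        printf("%g %g\n", up[0][0], up[1][0]); // 3 and 13
    }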
@@ ... @@ ggml_tensor * llm_graph_context::build_attn_mha(
         ggml_tensor * kq_b,
         ggml_tensor * kq_mask,
         ggml_tensor * v_mla,
+        ggml_tensor * sinks,
         float     kq_scale) const {
     const bool v_trans = v->nb[1] > v->nb[2];
@@ ... @@
         cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
                                   hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
 
+        ggml_flash_attn_ext_add_sinks(cur, sinks);
+        ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
 
         if (v_mla) {
 #if 0
@@ ... @@
         }
 
         kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
+        ggml_soft_max_add_sinks(kq, sinks);
 
         if (!v_trans) {
             // note: avoid this branch
@@ ... @@
     ggml_tensor * k = k_cur;
     ggml_tensor * v = v_cur;
 
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
     cb(cur, "kqv_out", il);
 
     if (wo) {
@@ ... @@
     ggml_tensor * k = mctx_cur->get_k(ctx0, il);
     ggml_tensor * v = mctx_cur->get_v(ctx0, il);
 
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
     cb(cur, "kqv_out", il);
 
     if (wo) {
         cur = build_lora_mm(wo, cur);
+        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
+            // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
             ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
         }
     }
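The two ggml_*_add_sinks calls above wire in attention sinks: each head gets one extra learned logit that joins the softmax denominator but contributes no value vector, so the regular attention weights can sum to less than one. A toy scalar version of that softmax, assuming the standard sink formulation:

    #include <cmath>
    #include <cstdio>

    int main() {
        float kq[3] = {1.0f, 2.0f, 0.5f}; // scaled K.Q logits for one query, one head
        float sink  = 1.5f;               // learned per-head sink logit

        // the denominator includes the sink, the numerators do not
        float den = std::exp(sink);
        for (float x : kq) den += std::exp(x);

        float total = 0.0f;
        for (float x : kq) {
            float w = std::exp(x) / den;
            total += w;
            printf("%.3f ", w);
        }
        printf("\n(sum %.3f < 1; the remainder went to the sink)\n", total);
    }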
@@ ... @@
         ggml_tensor * v_mla,
         float     kq_scale,
         int       il) const {
+    return build_attn_with_sinks(
+            inp,
+            wo,
+            wo_b,
+            q_cur,
+            k_cur,
+            v_cur,
+            kq_b,
+            v_mla,
+            nullptr,
+            kq_scale,
+            il);
+}
+
+ggml_tensor * llm_graph_context::build_attn_with_sinks(
+        llm_graph_input_attn_kv_unified_iswa * inp,
+        ggml_tensor * wo,
+        ggml_tensor * wo_b,
+        ggml_tensor * q_cur,
+        ggml_tensor * k_cur,
+        ggml_tensor * v_cur,
+        ggml_tensor * kq_b,
+        ggml_tensor * v_mla,
+        ggml_tensor * sinks,
+        float     kq_scale,
+        int       il) const {
     // these nodes are added to the graph together so that they are not reordered
     // by doing so, the number of splits in the graph is reduced
     ggml_build_forward_expand(gf, q_cur);
@@ ... @@
     ggml_tensor * k = mctx_cur->get_k(ctx0, il);
     ggml_tensor * v = mctx_cur->get_v(ctx0, il);
 
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, sinks, kq_scale);
     cb(cur, "kqv_out", il);
 
     if (wo) {
@@ ... @@
     ggml_tensor * k = k_cur;
     ggml_tensor * v = v_cur;
 
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
     cb(cur, "kqv_out", il);
 
     if (wo) {
@@ ... @@
 
 ggml_tensor * llm_graph_context::build_rs(
         ggml_tensor * s,
+        ggml_tensor * state_copy_main,
+        ggml_tensor * state_copy_extra,
         int32_t   state_size,
         int32_t   n_seqs,
+        uint32_t  n_rs,
+        uint32_t  rs_head,
+        uint32_t  rs_size,
         int32_t   rs_zero,
         const llm_graph_get_rows_fn & get_state_rows) const {
 
+    ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, rs_size);
 
     // Clear a single state which will then be copied to the other cleared states.
     // Note that this is a no-op when the view is zero-sized.
     [... line not shown in the diff viewer ...]
     ggml_build_forward_expand(gf, ggml_scale_inplace(ctx0, state_zero, 0));
 
     // copy states
+    // NOTE: assuming the copy destinations are ALL contained between rs_head and rs_head + n_rs
+    // {state_size, rs_size} -> {state_size, n_seqs}
+    ggml_tensor * output_states = get_state_rows(ctx0, states, state_copy_main);
     ggml_build_forward_expand(gf, output_states);
 
+    // copy extra states which won't be changed further (between n_seqs and n_rs)
+    ggml_tensor * states_extra = ggml_get_rows(ctx0, states, state_copy_extra);
     ggml_build_forward_expand(gf,
         ggml_cpy(ctx0,
             states_extra,
+            ggml_view_1d(ctx0, s, state_size*(n_rs - n_seqs), (rs_head + n_seqs)*state_size*ggml_element_size(s))));
 
     return output_states;
 }
 
 static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
         ggml_context * ctx0,
+        const llama_ubatch & ubatch,
         const llama_memory_recurrent_context * mctx_cur) {
 
     auto inp = std::make_unique<llm_graph_input_rs>(mctx_cur);
 
+    const int64_t n_rs   = mctx_cur->get_n_rs();
+    const int64_t n_seqs = ubatch.n_seqs;
 
     inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_rs);
     ggml_set_input(inp->s_copy);
 
+    inp->s_copy_main  = ggml_view_1d(ctx0, inp->s_copy, n_seqs, 0);
+    inp->s_copy_extra = ggml_view_1d(ctx0, inp->s_copy, n_rs - n_seqs, n_seqs * inp->s_copy->nb[0]);
+
     return inp;
 }
 
 llm_graph_input_rs * llm_graph_context::build_rs_inp() const {
     const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
 
+    auto inp = build_rs_inp_impl(ctx0, ubatch, mctx_cur);
 
     return (llm_graph_input_rs *) res->add_input(std::move(inp));
 }
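To make the s_copy change above concrete: the single I32 input is filled once per graph and then sliced into two views. The first n_seqs entries feed get_state_rows for the states the graph will actually update, while the trailing n_rs - n_seqs entries are only gathered and copied back contiguously behind them. A schematic of the index layout with hypothetical values:

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_rs = 5, n_seqs = 3;

        // s_copy[i]: which existing state row feeds slot rs_head + i
        std::vector<int> s_copy = {4, 0, 2, 1, 3};

        // view 1 (s_copy_main): rows gathered for the n_seqs states the graph computes on
        for (int i = 0; i < n_seqs; ++i) printf("main  slot %d <- row %d\n", i, s_copy[i]);

        // view 2 (s_copy_extra): rows copied as-is into slots [rs_head + n_seqs, rs_head + n_rs)
        for (int i = n_seqs; i < n_rs; ++i) printf("extra slot %d <- row %d\n", i, s_copy[i]);
    }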
@@ -1623,7 +1742,9 @@ ggml_tensor * llm_graph_context::build_rs(
         const llm_graph_get_rows_fn & get_state_rows) const {
     const auto * kv_state = inp->mctx;
 
-    return build_rs(s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), get_state_rows);
+    return build_rs(s, inp->s_copy_main, inp->s_copy_extra, state_size, n_seqs,
+            kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(),
+            get_state_rows);
 }
 
 ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
@@ -1670,7 +1791,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
 llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
     const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
 
-    auto inp_rs   = build_rs_inp_impl(ctx0, mctx_cur->get_recr());
+    auto inp_rs   = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
     auto inp_attn = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
 
     auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
examples/talk-llama/llama-graph.h
CHANGED
@@ -39,6 +39,7 @@ enum llm_ffn_op_type {
     LLM_FFN_SWIGLU,
     LLM_FFN_GEGLU,
     LLM_FFN_REGLU,
+    LLM_FFN_SWIGLU_OAI_MOE,
 };
 
 enum llm_ffn_gate_type {
@@ -144,7 +145,7 @@ public:
 
     ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]
 
-    const llama_hparams & hparams;
+    const llama_hparams hparams;
 };
 
 class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
@@ -158,7 +159,7 @@ public:
 
     ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]
 
-    const llama_hparams & hparams;
+    const llama_hparams hparams;
 
     const llama_kv_cache_unified_context * mctx;
 };
@@ -177,8 +178,8 @@ public:
 
     ggml_tensor * out_ids; // I32 [n_outputs]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const uint32_t n_outputs;
 };
@@ -192,7 +193,7 @@ public:
 
     ggml_tensor * mean; // F32 [n_batch, n_batch]
 
-    const llama_cparams & cparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_cls : public llm_graph_input_i {
@@ -204,7 +205,7 @@ public:
 
     ggml_tensor * cls; // I32 [n_batch]
 
-    const llama_cparams & cparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_rs : public llm_graph_input_i {
@@ -214,7 +215,12 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
-    ggml_tensor * s_copy;
+    ggml_tensor * s_copy;       // I32 [n_rs]
+
+    // views of s_copy, computed once per graph
+    // and shared across layers which use build_rs
+    ggml_tensor * s_copy_main;  // I32 [n_seqs]
+    ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
 
     const llama_memory_recurrent_context * mctx;
 };
@@ -247,8 +253,8 @@ public:
     ggml_tensor * kq_mask     = nullptr; // F32 [n_tokens, n_batch, 1, 1]
     ggml_tensor * kq_mask_cnv = nullptr; //     [n_tokens, n_batch, 1, 1]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
@@ -278,8 +284,11 @@ public:
     ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
     ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    // note: these have to be copies because in order to be able to reuse a graph, its inputs
+    //       need to carry these parameters with them. otherwise, they can point to freed
+    //       llm_graph_params from a previous batch, causing stack-use-after-return
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const llama_kv_cache_unified_context * mctx;
 };
@@ -318,8 +327,8 @@ public:
     ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
     ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const llama_kv_cache_unified_iswa_context * mctx;
 };
@@ -415,7 +424,9 @@ struct llm_graph_params {
             (!ubatch.embd && !other.ubatch.embd)
         );
 
-        if (can_reuse_ubatch) {
+        // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
+        // the reason is because the set of attention streams would be different for different sequences
+        if (can_reuse_ubatch && ubatch.equal_seqs()) {
             if (!ubatch.data) {
                 // if the old ubatch does not own it's data, then we cannot guarantee that it is still alive, and
                 //  therefore we cannot perform the sequence id check. normally should never happen
@@ -609,6 +620,7 @@ struct llm_graph_context {
         llm_ffn_gate_type type_gate,
         int   il) const;
 
+    // build MoE FFN without bias tensors
     ggml_tensor * build_moe_ffn(
         ggml_tensor * cur,
         ggml_tensor * gate_inp,
@@ -623,7 +635,29 @@ struct llm_graph_context {
         bool   scale_w,
         float  w_scale,
         llama_expert_gating_func_type gating_op,
-        int   il) const;
+        int   il,
+        ggml_tensor * probs_in = nullptr) const;
+
+    ggml_tensor * build_moe_ffn(
+        ggml_tensor * cur,
+        ggml_tensor * gate_inp,
+        ggml_tensor * gate_inp_b,
+        ggml_tensor * up_exps,
+        ggml_tensor * up_exps_b,
+        ggml_tensor * gate_exps,
+        ggml_tensor * gate_exps_b,
+        ggml_tensor * down_exps,
+        ggml_tensor * down_exps_b,
+        ggml_tensor * exp_probs_b,
+        int64_t n_expert,
+        int64_t n_expert_used,
+        llm_ffn_op_type type_op,
+        bool   norm_w,
+        bool   scale_w,
+        float  w_scale,
+        llama_expert_gating_func_type gating_op,
+        int   il,
+        ggml_tensor * probs_in = nullptr) const;
 
     //
     // inputs
@@ -651,6 +685,7 @@ struct llm_graph_context {
         ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
         ggml_tensor * kq_b,
         ggml_tensor * kq_mask,
+        ggml_tensor * sinks,
         ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
         float kq_scale) const;
 
@@ -697,6 +732,20 @@ struct llm_graph_context {
         float kq_scale,
         int   il) const;
 
+    // TODO: temporary to keep the diff small. after the code is public will refactor to simplify this
+    ggml_tensor * build_attn_with_sinks(
+        llm_graph_input_attn_kv_unified_iswa * inp,
+        ggml_tensor * wo,
+        ggml_tensor * wo_b,
+        ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+        ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
+        ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
+        ggml_tensor * kq_b,
+        ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+        ggml_tensor * sinks, // [n_head_q]
+        float kq_scale,
+        int   il) const;
+
     llm_graph_input_attn_cross * build_attn_inp_cross() const;
 
     ggml_tensor * build_attn(
@@ -715,7 +764,6 @@ struct llm_graph_context {
     // recurrent
     //
 
-    // TODO: avoid notion of "kv"
     // TODO: move this implementation to llama_memory_recurrent.
     // this is analogous to llama_kv_cache_unified::cpy_k / cpy_v
     // when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the
@@ -723,12 +771,13 @@ struct llm_graph_context {
     // `llama_memory_recurrent`
     ggml_tensor * build_rs(
         ggml_tensor * s,
-        ggml_tensor * state_copy,
+        ggml_tensor * state_copy_main,
+        ggml_tensor * state_copy_extra,
         int32_t   state_size,
         int32_t   n_seqs,
-        uint32_t  n_kv,
-        uint32_t  kv_head,
-        uint32_t  kv_size,
+        uint32_t  n_rs,
+        uint32_t  rs_head,
+        uint32_t  rs_size,
         int32_t   rs_zero,
         const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
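The recurring `const llama_hparams & hparams;` to `const llama_hparams hparams;` edits in this header follow from the note added at the kv_unified input: when a graph is reused, its input objects can outlive the llm_graph_params that produced them, so reference members would dangle. A minimal illustration of that hazard with simplified stand-in types (not the llama.cpp classes):

    #include <cstdio>

    struct params { int n_ctx; };

    struct input_ref  { const params & p; }; // old style: dangles if params dies first
    struct input_copy { params p;         }; // new style: self-contained

    input_ref  make_ref () { params tmp = {4096}; return input_ref {tmp}; } // dangling reference!
    input_copy make_copy() { params tmp = {4096}; return input_copy{tmp}; } // safe copy

    int main() {
        input_copy ok = make_copy();
        printf("%d\n", ok.p.n_ctx); // 4096; reading through make_ref()'s member would be
                                    // the stack-use-after-return the comment warns about
    }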
examples/talk-llama/llama-hparams.cpp
CHANGED
@@ -2,9 +2,15 @@
 
 #include "ggml.h"
 
-void llama_hparams::set_swa_pattern(uint32_t n_pattern) {
-    for (uint32_t il = 0; il < n_layer; ++il) {
-        swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
+void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
+    if (dense_first) {
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            swa_layers[il] = n_pattern == 0 || (il % n_pattern != 0);
+        }
+    } else {
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
+        }
     }
 }
 
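To check the two branches against the examples documented in llama-hparams.h, here is what each expression yields for n_pattern = 2 over six layers (standalone sketch using the same formulas):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t n_pattern = 2, n_layer = 6;

        for (int dense_first = 0; dense_first <= 1; ++dense_first) {
            printf("dense_first=%d: ", dense_first);
            for (uint32_t il = 0; il < n_layer; ++il) {
                const bool swa = dense_first ? (il % n_pattern != 0)
                                             : (il % n_pattern < (n_pattern - 1));
                printf("%s ", swa ? "swa" : "dense");
            }
            printf("\n");
        }
        // dense_first=0: swa dense swa dense swa dense
        // dense_first=1: dense swa dense swa dense swa
    }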
examples/talk-llama/llama-hparams.h
CHANGED
|
@@ -9,9 +9,10 @@
|
|
| 9 |
#define LLAMA_MAX_EXPERTS 384 // Kimi-K2
|
| 10 |
|
| 11 |
enum llama_expert_gating_func_type {
|
| 12 |
-
LLAMA_EXPERT_GATING_FUNC_TYPE_NONE
|
| 13 |
-
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX
|
| 14 |
-
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID
|
|
|
|
| 15 |
};
|
| 16 |
|
| 17 |
enum llama_swa_type {
|
|
@@ -73,6 +74,7 @@ struct llama_hparams {
|
|
| 73 |
bool expert_weights_norm = false;
|
| 74 |
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
|
| 75 |
uint32_t moe_every_n_layers = 0;
|
|
|
|
| 76 |
|
| 77 |
float f_norm_eps;
|
| 78 |
float f_norm_rms_eps;
|
|
@@ -140,7 +142,7 @@ struct llama_hparams {
|
|
| 140 |
// for Classifiers
|
| 141 |
uint32_t n_cls_out = 1;
|
| 142 |
|
| 143 |
-
// llama4
|
| 144 |
uint32_t n_moe_layer_step = 0;
|
| 145 |
uint32_t n_no_rope_layer_step = 4;
|
| 146 |
uint32_t n_attn_temp_floor_scale = 8192;
|
|
@@ -161,9 +163,10 @@ struct llama_hparams {
|
|
| 161 |
enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
|
| 162 |
|
| 163 |
// this value n_pattern means that every nth layer is dense (i.e. non-SWA)
|
|
|
|
| 164 |
// note that if n_pattern == 0, all layers are SWA
|
| 165 |
// if n_pattern == 1, all layers are dense
|
| 166 |
-
// example: n_pattern = 3
|
| 167 |
// il == 0: swa
|
| 168 |
// il == 1: swa
|
| 169 |
// il == 2: dense
|
|
@@ -172,7 +175,13 @@ struct llama_hparams {
|
|
| 172 |
// il == 5: dense
|
| 173 |
// il == 6: swa
|
| 174 |
// etc ...
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
// return true if one of the layers is SWA
|
| 178 |
bool is_swa_any() const;
|
|
|
|
| 9 |
#define LLAMA_MAX_EXPERTS 384 // Kimi-K2
|
| 10 |
|
| 11 |
enum llama_expert_gating_func_type {
|
| 12 |
+
LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
|
| 13 |
+
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
|
| 14 |
+
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
|
| 15 |
+
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT = 3, // applied to the router weights instead of the logits
|
| 16 |
};
|
| 17 |
|
| 18 |
enum llama_swa_type {
|
|
|
|
| 74 |
bool expert_weights_norm = false;
|
| 75 |
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
|
| 76 |
uint32_t moe_every_n_layers = 0;
|
| 77 |
+
uint32_t nextn_predict_layers = 0;
|
| 78 |
|
| 79 |
float f_norm_eps;
|
| 80 |
float f_norm_rms_eps;
|
|
|
|
| 142 |
// for Classifiers
|
| 143 |
uint32_t n_cls_out = 1;
|
| 144 |
|
| 145 |
+
// llama4 smallthinker
|
| 146 |
uint32_t n_moe_layer_step = 0;
|
| 147 |
uint32_t n_no_rope_layer_step = 4;
|
| 148 |
uint32_t n_attn_temp_floor_scale = 8192;
|
|
|
|
| 163 |
enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
|
| 164 |
|
| 165 |
// this value n_pattern means that every nth layer is dense (i.e. non-SWA)
|
| 166 |
+
// dense_first means whether the pattern is start with a dense layer
|
| 167 |
// note that if n_pattern == 0, all layers are SWA
|
| 168 |
// if n_pattern == 1, all layers are dense
|
| 169 |
+
// example 1: n_pattern = 3, dense_first = false
|
| 170 |
// il == 0: swa
|
| 171 |
// il == 1: swa
|
| 172 |
// il == 2: dense
|
|
|
|
| 175 |
// il == 5: dense
|
| 176 |
// il == 6: swa
|
| 177 |
// etc ...
|
| 178 |
+
// example 2: n_pattern = 2, dense_first = true
|
| 179 |
+
// il == 0: dense
|
| 180 |
+
// il == 1: swa
|
| 181 |
+
// il == 2: dense
|
| 182 |
+
// il == 3: swa
|
| 183 |
+
// etc ...
|
| 184 |
+
void set_swa_pattern(uint32_t n_pattern, bool dense_first = false);
|
| 185 |
|
| 186 |
// return true if one of the layers is SWA
|
| 187 |
bool is_swa_any() const;
|
examples/talk-llama/llama-kv-cache-unified-iswa.cpp
CHANGED
@@ -194,14 +194,20 @@ bool llama_kv_cache_unified_iswa::get_can_shift() const {
     return kv_base->get_size() == kv_swa->get_size();
 }
 
-void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
-    kv_base->state_write(io, seq_id);
-    kv_swa ->state_write(io, seq_id);
+void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+        kv_base->state_write(io, seq_id, flags);
+    }
+
+    kv_swa->state_write(io, seq_id, flags);
 }
 
-void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
-    kv_base->state_read(io, seq_id);
-    kv_swa ->state_read(io, seq_id);
+void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+        kv_base->state_read(io, seq_id, flags);
+    }
+
+    kv_swa->state_read(io, seq_id, flags);
 }
 
 llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_base() const {
examples/talk-llama/llama-kv-cache-unified-iswa.h
CHANGED
@@ -56,8 +56,8 @@ public:
 
     // state write/load
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1)       override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0)       override;
 
     //
     // llama_kv_cache_unified_iswa specific API
examples/talk-llama/llama-kv-cache-unified.cpp
CHANGED
@@ -39,6 +39,10 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     if (model.arch == LLM_ARCH_GEMMA3N) {
         n_layer_cache = 20;
     }
+    if (model.arch == LLM_ARCH_GLM4_MOE) {
+        // GLM-4.5: Only process up to last layer, skip final NextN layer
+        n_layer_cache = hparams.n_layer - hparams.nextn_predict_layers;
+    }
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -183,7 +187,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     const size_t memory_size_k = size_k_bytes();
     const size_t memory_size_v = size_v_bytes();
 
-    LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%... [truncated in the diff viewer]
+    LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
             (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, n_stream,
             ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
             ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
@@ -193,7 +197,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
 
     const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : false;
+    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;
 
     if (!supports_set_rows) {
         // ref: https://github.com/ggml-org/llama.cpp/pull/14363
@@ -219,12 +223,7 @@ void llama_kv_cache_unified::clear(bool data) {
 }
 
 bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
-    GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
-
-    auto & cells = v_cells[seq_to_stream[seq_id]];
-    auto & head  = v_heads[seq_to_stream[seq_id]];
-
-    uint32_t new_head = cells.size();
+    GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
 
     if (p0 < 0) {
         p0 = 0;
@@ -235,6 +234,11 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
     }
 
     if (seq_id >= 0) {
+        auto & cells = v_cells[seq_to_stream[seq_id]];
+        auto & head  = v_heads[seq_to_stream[seq_id]];
+
+        uint32_t new_head = cells.size();
+
         for (uint32_t i = 0; i < cells.size(); ++i) {
             if (!cells.pos_in(i, p0, p1)) {
                 continue;
@@ -246,24 +250,36 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
             }
         }
+
+        // If we freed up a slot, set head to it so searching can start there.
+        if (new_head != cells.size() && new_head < head) {
+            head = new_head;
+        }
     } else {
         // match any sequence
-        for (uint32_t i = 0; i < cells.size(); ++i) {
-            if (!cells.pos_in(i, p0, p1)) {
-                continue;
-            }
-
-            cells.rm(i);
-
-            if (new_head == cells.size()) {
-                new_head = i;
-            }
-        }
-    }
-
-    // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cells.size() && new_head < head) {
-        head = new_head;
-    }
+        for (uint32_t s = 0; s < n_stream; ++s) {
+            auto & cells = v_cells[s];
+            auto & head  = v_heads[s];
+
+            uint32_t new_head = cells.size();
+
+            for (uint32_t i = 0; i < cells.size(); ++i) {
+                if (!cells.pos_in(i, p0, p1)) {
+                    continue;
+                }
+
+                cells.rm(i);
+
+                if (new_head == cells.size()) {
+                    new_head = i;
+                }
            }
+
+            // If we freed up a slot, set head to it so searching can start there.
+            if (new_head != cells.size() && new_head < head) {
+                head = new_head;
+            }
+        }
     }
 
     return true;
@@ -734,66 +750,70 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
 }
 
 llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch, bool cont) const {
-    if (debug > 0) {
-        const auto & cells = v_cells[seq_to_stream[1]];
-        [... remainder of the old single-stream debug block not shown in the diff viewer ...]
 
+    if (debug > 0) {
+        for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+            const auto seq_id    = ubatch.seq_id_unq[s];
+            const auto stream_id = seq_to_stream[seq_id];
+            const auto & cells   = v_cells[stream_id];
+            const uint32_t head_cur = v_heads[stream_id];
+
+            LLAMA_LOG_DEBUG("%s: stream[%d], n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
+                    __func__, stream_id, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);
+
+            if ((debug == 2 && n_swa > 0) || debug > 2) {
+                std::string ss;
+                for (uint32_t i = 0; i < cells.size(); ++i) {
+                    if (cells.is_empty(i)) {
+                        ss += '.';
+                    } else {
+                        assert(cells.seq_count(i) >= 1);
 
+                        if (cells.seq_count(i) == 1) {
+                            ss += std::to_string(cells.seq_get(i));
+                        } else {
+                            ss += 'M';
+                        }
+                    }
+                    if (i%256 == 255) {
+                        ss += " *";
+                        ss += '\n';
+                    }
+                }
+                LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
+            }
 
+            if ((debug == 2 && n_swa > 0) || debug > 2) {
+                std::string ss;
+                for (uint32_t i = 0; i < cells.size(); ++i) {
+                    std::string cur;
+                    if (cells.is_empty(i)) {
+                        cur = '.';
                    } else {
+                        cur = std::to_string(cells.pos_get(i));
+                    }
+                    const int n = cur.size();
+                    for (int j = 0; j < 5 - n; ++j) {
+                        cur += ' ';
+                    }
+                    ss += cur;
+                    if (i%256 == 255) {
+                        ss += " *";
+                    }
+                    if (i%64 == 63) {
+                        ss += '\n';
                    }
                }
+                LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
            }
 
+            for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+                if (cells.seq_pos_min(s) < 0) {
+                    continue;
                }
 
+                LLAMA_LOG_DEBUG("%s: stream[%d] min[%d] = %5d, max[%d] = %5d\n", __func__, stream_id, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
            }
        }
    }
 
@@ -1808,7 +1828,9 @@ bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
     return false;
 }
 
-void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    GGML_UNUSED(flags);
+
     io.write(&n_stream, sizeof(n_stream));
 
     for (uint32_t s = 0; s < n_stream; ++s) {
@@ -1859,7 +1881,9 @@ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq
     }
 }
 
-void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    GGML_UNUSED(flags);
+
     GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
 
     uint32_t n_stream_cur;
examples/talk-llama/llama-kv-cache-unified.h
CHANGED
@@ -136,8 +136,8 @@ public:
 
     // state write/load
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1)       override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0)       override;
 
     //
     // llama_kv_cache_unified specific API
 
@@ -230,7 +230,7 @@ private:
 
     // env: LLAMA_SET_ROWS (temporary)
     // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows = false;
+    bool supports_set_rows = true;
 
     const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
examples/talk-llama/llama-memory-hybrid.cpp
CHANGED
@@ -25,6 +25,7 @@ llama_memory_hybrid::llama_memory_hybrid(
     /* common */
     uint32_t n_seq_max,
     bool     offload,
+    bool     unified,
     /* layer filters */
     layer_filter_cb && filter_attn,
     layer_filter_cb && filter_recr) :
@@ -38,7 +39,7 @@ llama_memory_hybrid::llama_memory_hybrid(
         type_v,
         v_trans,
         offload,
-        [... argument truncated in the diff viewer ...]
+        unified,
         kv_size,
         n_seq_max,
         n_pad,
@@ -164,12 +165,16 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
     return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
 }
 
-void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    GGML_UNUSED(flags);
+
     mem_attn->state_write(io, seq_id);
     mem_recr->state_write(io, seq_id);
 }
 
-void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    GGML_UNUSED(flags);
+
     mem_attn->state_read(io, seq_id);
     mem_recr->state_read(io, seq_id);
 }
examples/talk-llama/llama-memory-hybrid.h
CHANGED
@@ -39,6 +39,7 @@ public:
     /* common */
     uint32_t n_seq_max,
     bool     offload,
+    bool     unified,
     /* layer filters */
     layer_filter_cb && filter_attn = nullptr,
     layer_filter_cb && filter_recr = nullptr);
@@ -73,8 +74,8 @@ public:
 
     // state write/load
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     //
     // llama_memory_hybrid specific API
examples/talk-llama/llama-memory-recurrent.cpp
CHANGED
@@ -680,7 +680,9 @@ size_t llama_memory_recurrent::size_s_bytes() const {
     return size_s_bytes;
 }
 
-void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    GGML_UNUSED(flags);
+
     std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
     uint32_t cell_count = 0;
 
@@ -718,7 +720,9 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
     state_write_data(io, cell_ranges);
 }
 
-void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    GGML_UNUSED(flags);
+
     uint32_t cell_count;
     io.read_to(&cell_count, sizeof(cell_count));
 
examples/talk-llama/llama-memory-recurrent.h
CHANGED
@@ -63,8 +63,8 @@ public:
 
     // state write/load
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
     uint32_t size = 0; // total number of cells, shared across all sequences
examples/talk-llama/llama-memory.h
CHANGED
@@ -104,8 +104,8 @@ struct llama_memory_i {
     // state write/read
     //
 
-    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
-    virtual void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1)       = 0;
+    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const = 0;
+    virtual void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0)       = 0;
 };
 
 using llama_memory_ptr = std::unique_ptr<llama_memory_i>;
examples/talk-llama/llama-model-loader.cpp
CHANGED
@@ -35,6 +35,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
+        case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE";
         case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
examples/talk-llama/llama-model-loader.h
CHANGED
@@ -58,8 +58,9 @@ struct llama_model_loader {
     }
 };
 
-    static const int TENSOR_NOT_REQUIRED = 1;
-    static const int TENSOR_DUPLICATED   = 2;
+    static const int TENSOR_NOT_REQUIRED = 1 << 0;
+    static const int TENSOR_DUPLICATED   = 1 << 1;
+    static const int TENSOR_SKIP         = 1 << 2;
 
     int n_kv      = 0;
     int n_tensors = 0;
examples/talk-llama/llama-model.cpp
CHANGED
@@ -109,8 +109,10 @@ const char * llm_type_name(llm_type type) {
     case LLM_TYPE_A13B:      return "A13B";
     case LLM_TYPE_21B_A3B:   return "21B.A3B";
     case LLM_TYPE_30B_A3B:   return "30B.A3B";
+    [... added line not shown in the diff viewer ...]
     case LLM_TYPE_235B_A22B: return "235B.A22B";
     case LLM_TYPE_300B_A47B: return "300B.A47B";
+    [... added line not shown in the diff viewer ...]
     case LLM_TYPE_E2B:       return "E2B";
     case LLM_TYPE_E4B:       return "E4B";
     default:                 return "?B";
@@ -190,6 +192,13 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
             ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
             op_tensor = ggml_add(ctx, a, w);
         } break;
+    [... added lines not shown in the diff viewer ...]
     case GGML_OP_MUL:
         {
             ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
@@ -258,6 +267,10 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
             ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
             op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
         } break;
+    [... added lines not shown in the diff viewer ...]
         default:
             GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
     }
@@ -290,7 +303,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
 }
 
 // CPU: ACCEL -> GPU host -> CPU extra -> CPU
-static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
+    [... replacement line not shown in the diff viewer ...]
     buft_list_t buft_list;
 
     // add ACCEL buffer types
@@ -319,21 +332,22 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
     }
     }
 
-    // add extra buffer types
-    [... deleted lines not shown in the diff viewer ...]
+    [... added lines not shown in the diff viewer ...]
     }
 }
 
@@ -869,6 +883,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         hparams.causal_attn = false;
     }
     break;
+    [... added lines not shown in the diff viewer ...]
 case LLM_ARCH_QWEN2MOE:
     {
         ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -883,6 +912,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     } break;
 case LLM_ARCH_QWEN3:
     {
+    [... added line not shown in the diff viewer ...]
         ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
         switch (hparams.n_layer) {
             case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
@@ -1065,6 +1095,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
     switch (hparams.n_layer) {
+    [... added line not shown in the diff viewer ...]
         case 26: type = LLM_TYPE_1B; break;
         case 34: type = LLM_TYPE_4B; break;
        case 48: type = LLM_TYPE_12B; break;
@@ -1417,6 +1448,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         default: type = LLM_TYPE_UNKNOWN;
     }
 } break;
+    [... added lines not shown in the diff viewer (new architecture case) ...]
 case LLM_ARCH_BITNET:
     {
         ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1744,6 +1803,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         default: type = LLM_TYPE_UNKNOWN;
     }
 } break;
+    [... added lines not shown in the diff viewer ...]
 case LLM_ARCH_SMOLLM3:
     {
         ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1754,6 +1825,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         default: type = LLM_TYPE_UNKNOWN;
     }
 } break;
+    [... added lines not shown in the diff viewer ...]
 case LLM_ARCH_LFM2:
     {
         ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
@@ -1768,6 +1850,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         default: type = LLM_TYPE_UNKNOWN;
     }
 } break;
+    [... added lines not shown in the diff viewer ...]
     default: throw std::runtime_error("unsupported model architecture");
 }
 
@@ -1801,7 +1906,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
 
     // build a list of buffer types for the CPU and GPU devices
-    pimpl->cpu_buft_list = make_cpu_buft_list(devices);
+    [... replacement line not shown in the diff viewer ...]
     for (auto * dev : devices) {
         buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
         // add CPU buffer types as a fallback
@@ -1897,6 +2002,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
     const auto TENSOR_DUPLICATED   = llama_model_loader::TENSOR_DUPLICATED;
     const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
+    const auto TENSOR_SKIP         = llama_model_loader::TENSOR_SKIP;
 
     // create tensors for the weights
     {
@@ -1952,7 +2058,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     // skip unused tensors
-    if (info.op == GGML_OP_NONE) {
+    [... replacement line not shown in the diff viewer ...]
         const size_t nbytes = ggml_nbytes(t_meta);
         LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
 
@@ -1962,11 +2068,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         return nullptr;
     }
 
-    // tensors with "bias" suffix are always used with GGML_OP_ADD
     ggml_op op;
     bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
     if (bias) {
-        op = GGML_OP_ADD;
+    [... replacement lines not shown in the diff viewer ...]
     } else {
         op = info.op;
    }
@@ -2006,7 +2116,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
         std::regex pattern(overrides->pattern);
         if (std::regex_search(tensor_name, pattern)) {
-            [... deleted line not shown in the diff viewer ...]
+            [... added lines not shown in the diff viewer ...]
             LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
                 tensor_name.c_str(),
                 ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
@@ -2126,6 +2242,53 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             }
         }
     } break;
+    [... added lines not shown in the diff viewer (new architecture tensor mappings) ...]
 case LLM_ARCH_LLAMA4:
     {
         tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4322,6 +4485,105 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
     }
 } break;
+    [... added lines not shown in the diff viewer (new architecture tensor mappings) ...]
 case LLM_ARCH_NEMOTRON:
     {
         tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5103,6 +5365,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
}
|
| 5105 |
} break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5106 |
case LLM_ARCH_SMOLLM3:
|
| 5107 |
{
|
| 5108 |
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -5132,6 +5427,46 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
| 5132 |
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
| 5133 |
}
|
| 5134 |
} break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5135 |
case LLM_ARCH_LFM2:
|
| 5136 |
{
|
| 5137 |
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -5165,6 +5500,42 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
| 5165 |
}
|
| 5166 |
}
|
| 5167 |
} break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5168 |
default:
|
| 5169 |
throw std::runtime_error("unknown architecture");
|
| 5170 |
}
|
|
@@ -5468,7 +5839,7 @@ void llama_model::print_info() const {
|
|
| 5468 |
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
| 5469 |
}
|
| 5470 |
|
| 5471 |
-
if (arch == LLM_ARCH_QWEN3MOE) {
|
| 5472 |
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
| 5473 |
}
|
| 5474 |
|
|
@@ -5490,6 +5861,11 @@ void llama_model::print_info() const {
|
|
| 5490 |
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
|
| 5491 |
}
|
| 5492 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5493 |
vocab.print_info();
|
| 5494 |
}
|
| 5495 |
|
|
@@ -7978,8 +8354,10 @@ struct llm_build_dream : public llm_graph_context {
|
|
| 7978 |
}
|
| 7979 |
};
|
| 7980 |
|
| 7981 |
-
struct
|
| 7982 |
-
|
|
|
|
|
|
|
| 7983 |
const int64_t n_embd_head = hparams.n_embd_head_v;
|
| 7984 |
|
| 7985 |
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -7993,10 +8371,8 @@ struct llm_build_qwen2vl : public llm_graph_context {
|
|
| 7993 |
// inp_pos - contains the positions
|
| 7994 |
ggml_tensor * inp_pos = build_inp_pos();
|
| 7995 |
|
| 7996 |
-
|
| 7997 |
-
|
| 7998 |
-
int sections[4];
|
| 7999 |
-
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
| 8000 |
|
| 8001 |
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
| 8002 |
|
|
@@ -8004,34 +8380,134 @@ struct llm_build_qwen2vl : public llm_graph_context {
|
|
| 8004 |
ggml_tensor * inpSA = inpL;
|
| 8005 |
|
| 8006 |
// norm
|
| 8007 |
-
cur = build_norm(inpL,
|
| 8008 |
-
model.layers[il].attn_norm, NULL,
|
| 8009 |
-
LLM_NORM_RMS, il);
|
| 8010 |
cb(cur, "attn_norm", il);
|
| 8011 |
|
| 8012 |
// self-attention
|
| 8013 |
{
|
| 8014 |
-
// compute Q
|
| 8015 |
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
| 8016 |
-
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
| 8017 |
-
cb(Qcur, "Qcur", il);
|
| 8018 |
-
|
| 8019 |
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
| 8020 |
-
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
| 8021 |
-
cb(Kcur, "Kcur", il);
|
| 8022 |
-
|
| 8023 |
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
| 8024 |
-
|
|
|
|
|
|
|
| 8025 |
cb(Vcur, "Vcur", il);
|
| 8026 |
|
| 8027 |
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,
|
| 8028 |
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
| 8029 |
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
| 8030 |
|
| 8031 |
-
Qcur =
|
| 8032 |
-
|
| 8033 |
-
|
| 8034 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8035 |
);
|
| 8036 |
|
| 8037 |
Kcur = ggml_rope_multi(
|
|
@@ -13285,6 +13761,165 @@ struct llm_build_glm4 : public llm_graph_context {
|
|
| 13285 |
}
|
| 13286 |
};
|
| 13287 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13288 |
struct llm_build_nemotron : public llm_graph_context {
|
| 13289 |
llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
| 13290 |
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
@@ -16697,8 +17332,8 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
|
|
| 16697 |
}
|
| 16698 |
};
|
| 16699 |
|
| 16700 |
-
struct
|
| 16701 |
-
|
| 16702 |
const int64_t n_embd_head = hparams.n_embd_head_v;
|
| 16703 |
|
| 16704 |
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -16714,23 +17349,23 @@ struct llm_build_smollm3 : public llm_graph_context {
|
|
| 16714 |
|
| 16715 |
auto * inp_attn = build_attn_inp_kv_unified();
|
| 16716 |
|
| 16717 |
-
const float kq_scale =
|
| 16718 |
|
| 16719 |
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
| 16720 |
|
| 16721 |
for (int il = 0; il < n_layer; ++il) {
|
| 16722 |
ggml_tensor * inpSA = inpL;
|
| 16723 |
|
| 16724 |
-
const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
|
| 16725 |
-
|
| 16726 |
// norm
|
| 16727 |
cur = build_norm(inpL,
|
| 16728 |
model.layers[il].attn_norm, NULL,
|
| 16729 |
LLM_NORM_RMS, il);
|
| 16730 |
cb(cur, "attn_norm", il);
|
| 16731 |
-
|
| 16732 |
// self-attention
|
| 16733 |
{
|
|
|
|
|
|
|
|
|
|
| 16734 |
// compute Q and K and RoPE them
|
| 16735 |
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
| 16736 |
cb(Qcur, "Qcur", il);
|
|
@@ -16757,10 +17392,148 @@ struct llm_build_smollm3 : public llm_graph_context {
|
|
| 16757 |
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
| 16758 |
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
| 16759 |
|
| 16760 |
-
|
| 16761 |
-
|
| 16762 |
-
|
| 16763 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16764 |
ext_factor, attn_factor, beta_fast, beta_slow
|
| 16765 |
);
|
| 16766 |
|
|
@@ -16834,6 +17607,136 @@ struct llm_build_smollm3 : public llm_graph_context {
|
|
| 16834 |
}
|
| 16835 |
};
|
| 16836 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16837 |
struct llm_build_lfm2 : public llm_graph_context {
|
| 16838 |
const llama_model & model;
|
| 16839 |
|
|
@@ -17011,6 +17914,127 @@ struct llm_build_lfm2 : public llm_graph_context {
|
|
| 17011 |
}
|
| 17012 |
};
|
| 17013 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17014 |
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
|
| 17015 |
llama_memory_i * res;
|
| 17016 |
|
|
@@ -17024,6 +18048,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|
| 17024 |
case LLM_ARCH_NEO_BERT:
|
| 17025 |
case LLM_ARCH_WAVTOKENIZER_DEC:
|
| 17026 |
case LLM_ARCH_DREAM:
|
|
|
|
| 17027 |
{
|
| 17028 |
res = nullptr;
|
| 17029 |
} break;
|
|
@@ -17059,6 +18084,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|
| 17059 |
/* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
|
| 17060 |
/* n_seq_max */ cparams.n_seq_max,
|
| 17061 |
/* offload */ cparams.offload_kqv,
|
|
|
|
| 17062 |
/* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
|
| 17063 |
/* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
|
| 17064 |
} else {
|
|
@@ -17190,6 +18216,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|
| 17190 |
llm = std::make_unique<llm_build_dream>(*this, params);
|
| 17191 |
}
|
| 17192 |
break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17193 |
case LLM_ARCH_QWEN2VL:
|
| 17194 |
{
|
| 17195 |
llm = std::make_unique<llm_build_qwen2vl>(*this, params);
|
|
@@ -17332,6 +18363,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|
| 17332 |
{
|
| 17333 |
llm = std::make_unique<llm_build_glm4>(*this, params);
|
| 17334 |
} break;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17335 |
case LLM_ARCH_BITNET:
|
| 17336 |
{
|
| 17337 |
llm = std::make_unique<llm_build_bitnet>(*this, params);
|
|
@@ -17437,10 +18472,18 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|
| 17437 |
{
|
| 17438 |
llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
|
| 17439 |
} break;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17440 |
case LLM_ARCH_SMOLLM3:
|
| 17441 |
{
|
| 17442 |
llm = std::make_unique<llm_build_smollm3>(*this, params);
|
| 17443 |
} break;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17444 |
case LLM_ARCH_FALCON_H1:
|
| 17445 |
{
|
| 17446 |
llm = std::make_unique<llm_build_falcon_h1>(*this, params);
|
|
@@ -17449,6 +18492,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|
| 17449 |
{
|
| 17450 |
llm = std::make_unique<llm_build_lfm2>(*this, params);
|
| 17451 |
} break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17452 |
default:
|
| 17453 |
GGML_ABORT("fatal error");
|
| 17454 |
}
|
|
@@ -17478,6 +18529,7 @@ llama_model_params llama_model_default_params() {
|
|
| 17478 |
/*.use_mmap =*/ true,
|
| 17479 |
/*.use_mlock =*/ false,
|
| 17480 |
/*.check_tensors =*/ false,
|
|
|
|
| 17481 |
};
|
| 17482 |
|
| 17483 |
#ifdef GGML_USE_METAL
|
|
@@ -17580,6 +18632,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
| 17580 |
|
| 17581 |
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
| 17582 |
case LLM_ARCH_LLAMA:
|
|
|
|
| 17583 |
case LLM_ARCH_LLAMA4:
|
| 17584 |
case LLM_ARCH_DECI:
|
| 17585 |
case LLM_ARCH_BAICHUAN:
|
|
@@ -17646,7 +18699,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
| 17646 |
case LLM_ARCH_MINICPM3:
|
| 17647 |
case LLM_ARCH_DOTS1:
|
| 17648 |
case LLM_ARCH_HUNYUAN_MOE:
|
|
|
|
|
|
|
| 17649 |
case LLM_ARCH_LFM2:
|
|
|
|
|
|
|
| 17650 |
return LLAMA_ROPE_TYPE_NEOX;
|
| 17651 |
|
| 17652 |
case LLM_ARCH_QWEN2VL:
|
|
@@ -17757,6 +18814,10 @@ bool llama_model_is_recurrent(const llama_model * model) {
|
|
| 17757 |
return llm_arch_is_recurrent(model->arch);
|
| 17758 |
}
|
| 17759 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17760 |
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
|
| 17761 |
return model->tensors_by_name;
|
| 17762 |
}
|
|
|
|
case LLM_TYPE_A13B: return "A13B";
case LLM_TYPE_21B_A3B: return "21B.A3B";
case LLM_TYPE_30B_A3B: return "30B.A3B";
+ case LLM_TYPE_106B_A12B: return "106B.A12B";
case LLM_TYPE_235B_A22B: return "235B.A22B";
case LLM_TYPE_300B_A47B: return "300B.A47B";
+ case LLM_TYPE_355B_A32B: return "355B.A32B";
case LLM_TYPE_E2B: return "E2B";
case LLM_TYPE_E4B: return "E4B";
default: return "?B";
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
op_tensor = ggml_add(ctx, a, w);
} break;
+ case GGML_OP_ADD_ID:
+ {
+ int n_expert_used = hparams.n_expert_used;
+ ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
+ ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
+ op_tensor = ggml_add_id(ctx, a, w, c);
+ } break;
case GGML_OP_MUL:
{
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
} break;
+ case GGML_OP_SCALE:
+ {
+ op_tensor = ggml_scale(ctx, w, 1.0f);
+ } break;
default:
GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
}
}

// CPU: ACCEL -> GPU host -> CPU extra -> CPU
+ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
buft_list_t buft_list;

// add ACCEL buffer types

@@ -319,21 +332,22 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
}
}

- // add extra buffer types
+ // add extra buffer types
+ if (use_extra_bufts) {
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (cpu_dev == nullptr) {
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
+ }
+
+ auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+ auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+ ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+ if (ggml_backend_dev_get_extra_bufts_fn) {
+ ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+ while (extra_bufts && *extra_bufts) {
+ buft_list.emplace_back(cpu_dev, *extra_bufts);
+ ++extra_bufts;
+ }
+ }
}
}

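The priority order named in the comment above (ACCEL -> GPU host -> CPU extra -> CPU) is what later tensor placement walks through: the first buffer type whose backend claims support for a weight's op wins, and plain CPU stays last as the universal fallback. A minimal stand-alone C++ sketch of that selection loop, using hypothetical stand-in types rather than the real ggml_backend_* structs:

#include <cstdio>
#include <functional>
#include <string>
#include <utility>
#include <vector>

struct dev_t_  { std::string name; };                                           // stand-in for ggml_backend_dev_t
struct buft_t_ { std::string name; std::function<bool(const std::string &)> supports; }; // stand-in buffer type

int main() {
    // mimic the order established above: ACCEL -> CPU extra (e.g. repacked) -> plain CPU
    std::vector<std::pair<dev_t_, buft_t_>> buft_list = {
        {{"accel"}, {"accel",     [](const std::string & op) { return op == "MUL_MAT"; }}},
        {{"cpu"},   {"cpu_extra", [](const std::string & op) { return op == "MUL_MAT"; }}},
        {{"cpu"},   {"cpu",       [](const std::string &)   { return true; }}},
    };

    for (const std::string op : {"MUL_MAT", "ADD_ID"}) {
        for (const auto & [dv, bt] : buft_list) {
            if (bt.supports(op)) { // first supporting entry wins
                std::printf("%-7s -> %s (on %s)\n", op.c_str(), bt.name.c_str(), dv.name.c_str());
                break;
            }
        }
    }
    return 0;
}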
@@ -869,6 +883,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.causal_attn = false;
}
break;
+ case LLM_ARCH_LLADA:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
+ switch (hparams.n_layer) {
+ case 32:
+ type = LLM_TYPE_8B;
+ break;
+ default:
+ type = LLM_TYPE_UNKNOWN;
+ }
+ // Set non-causal attention for diffusion models
+ hparams.causal_attn = false;
+ }
+ break;
case LLM_ARCH_QWEN2MOE:
{
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -883,6 +912,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
} break;
case LLM_ARCH_QWEN3:
{
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
@@ -1065,6 +1095,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

switch (hparams.n_layer) {
+ case 18: type = LLM_TYPE_537M; break;
case 26: type = LLM_TYPE_1B; break;
case 34: type = LLM_TYPE_4B; break;
case 48: type = LLM_TYPE_12B; break;
@@ -1417,6 +1448,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_GLM4_MOE:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ // MoE parameters
+ ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
+ ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+
+ // Expert gating function (GLM-4.5 uses sigmoid)
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+ if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+ }
+
+ // NextN/MTP parameters
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+
+ switch (hparams.n_layer) {
+ case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
+ case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_BITNET:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
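For context on the fallback above: softmax gating normalizes the selected experts' router logits against each other, while sigmoid gating (the GLM-4.5 default applied here) squashes each logit independently and leaves any normalization to the later weight-norm/scale steps. A self-contained numeric sketch with illustrative logits:

#include <cmath>
#include <cstdio>

int main() {
    const float logits[2] = {1.2f, 0.4f}; // router scores of the two selected experts

    // softmax gating: weights sum to 1 across the selected experts
    const float denom = std::exp(logits[0]) + std::exp(logits[1]);
    std::printf("softmax: %.3f %.3f\n", std::exp(logits[0]) / denom, std::exp(logits[1]) / denom);

    // sigmoid gating: each expert weighted independently of the others
    std::printf("sigmoid: %.3f %.3f\n", 1.0f / (1.0f + std::exp(-logits[0])),
                                        1.0f / (1.0f + std::exp(-logits[1])));
    return 0;
}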
@@ -1744,6 +1803,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_HUNYUAN_DENSE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_embd) {
+ case 1024: type = LLM_TYPE_0_5B; break;
+ case 2048: type = LLM_TYPE_1_8B; break;
+ case 3072: type = LLM_TYPE_4B; break;
+ case 4096: type = LLM_TYPE_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_SMOLLM3:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1754,6 +1825,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_OPENAI_MOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.set_swa_pattern(2);
+
+ // TODO: switch (hparams.n_layer)
+ } break;
case LLM_ARCH_LFM2:
{
ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
@@ -1768,6 +1850,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_SMALLTHINKER:
+ {
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+
+ if (found_swa && hparams.n_swa > 0) {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.n_swa = 4096;
+ hparams.set_swa_pattern(4, true);
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ hparams.n_no_rope_layer_step = hparams.n_layer;
+ }
+
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_4B; break;
+ case 52: type = LLM_TYPE_20B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
default: throw std::runtime_error("unsupported model architecture");
}

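A hedged sketch of the layer schedule set_swa_pattern(4, true) plausibly produces: assuming it means one full-attention layer in every group of four, with the remaining layers using the 4096-token sliding window. The exact convention, including what the boolean argument toggles, lives in llama-hparams.cpp and is assumed here rather than quoted:

#include <cstdio>

int main() {
    const int n_layer = 32, period = 4;
    for (int il = 0; il < n_layer; ++il) {
        // assumed convention: the last layer of each group of `period` keeps full attention
        const bool is_swa = (il % period) != (period - 1);
        std::printf("layer %2d: %s\n", il, is_swa ? "sliding window (4096)" : "full attention");
    }
    return 0;
}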
@@ -1801,7 +1906,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");

// build a list of buffer types for the CPU and GPU devices
- pimpl->cpu_buft_list = make_cpu_buft_list(devices);
+ pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
for (auto * dev : devices) {
buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
// add CPU buffer types as a fallback
@@ -1897,6 +2002,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
+ const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;

// create tensors for the weights
{

@@ -1952,7 +2058,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}

// skip unused tensors
- if (info.op == GGML_OP_NONE) {
+ if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
const size_t nbytes = ggml_nbytes(t_meta);
LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);

@@ -1962,11 +2068,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
return nullptr;
}

- // tensors with "bias" suffix are always used with GGML_OP_ADD
+ // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
ggml_op op;
bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
if (bias) {
- op = GGML_OP_ADD;
+ if (info.op == GGML_OP_MUL_MAT_ID) {
+ op = GGML_OP_ADD_ID;
+ } else {
+ op = GGML_OP_ADD;
+ }
} else {
op = info.op;
}
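The MUL_MAT_ID -> ADD_ID pairing above exists because an expert-routed matmul carries a per-expert bias: which bias row gets added depends on the expert id the router picked for each token, so a plain broadcast add does not work. A scalar sketch of that gather-and-add (standard C++ only; the tiny shapes stand in for a [n_ff, n_expert] bias tensor and per-token expert ids):

#include <cstdio>
#include <vector>

int main() {
    const int n_ff = 4, n_expert = 3;

    // expert-specific bias rows, laid out as [n_expert][n_ff]
    std::vector<float> bias(n_ff * n_expert);
    for (int e = 0; e < n_expert; ++e)
        for (int j = 0; j < n_ff; ++j)
            bias[e * n_ff + j] = 0.1f * (e + 1);

    std::vector<float> act(n_ff, 1.0f); // one token's activation through one routed expert
    const int expert_id = 2;            // id chosen by the router for this token

    for (int j = 0; j < n_ff; ++j)
        act[j] += bias[expert_id * n_ff + j]; // the "add_id": gather the matching bias row, then add

    std::printf("act[0] = %.2f (expert %d)\n", act[0], expert_id);
    return 0;
}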
@@ -2006,7 +2116,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
std::regex pattern(overrides->pattern);
if (std::regex_search(tensor_name, pattern)) {
+ if (overrides->buft == ggml_backend_cpu_buffer_type()) {
+ // when overriding to a CPU buffer, consider the extra buffer types
+ buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
+ } else {
+ buft = overrides->buft;
+ }
+
LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
tensor_name.c_str(),
ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
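Override patterns are ordinary std::regex, matched with std::regex_search against the full tensor name, so a fragment like ffn_.*_exps is enough to catch every MoE expert tensor. A self-contained sketch of just the matching step (the tensor name and pattern below are examples, not defaults):

#include <cstdio>
#include <regex>
#include <string>

int main() {
    const std::string tensor_name = "blk.12.ffn_down_exps.weight";
    const std::regex  pattern("ffn_.*_exps"); // unanchored: regex_search matches anywhere in the name

    if (std::regex_search(tensor_name, pattern)) {
        std::printf("tensor %s: buffer type overridden\n", tensor_name.c_str());
    }
    return 0;
}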
@@ -2126,6 +2242,53 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
} break;
+ case LLM_ARCH_LLADA:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+ // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+ // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
+ TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
+
+ // optional MLP bias
+ layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
+ }
+ }
+ break;
case LLM_ARCH_LLAMA4:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4322,6 +4485,105 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
}
} break;
+ case LLM_ARCH_GLM4_MOE:
+ {
+ const int64_t n_expert = hparams.n_expert;
+ const int64_t n_expert_used = hparams.n_expert_used;
+ const int64_t n_expert_shared = hparams.n_expert_shared;
+
+ GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
+ GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+ }
+
+ // Load ALL tensors including NextN layer to satisfy total tensor count
+ // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
+ for (int i = 0; i < n_layer; ++i) {
+ int flags = 0;
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+ // skip all tensors in the NextN layers
+ flags |= TENSOR_SKIP;
+ }
+
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
+
+ // GLM-style attention with bias terms
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
+
+ // K/Q norm tensors (optional for GLM-4.5 355B variant)
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
+
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
+
+ // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
+ // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
+ const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
+
+ if (use_moe) {
+ // MoE layers
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
+
+ // MoE branch
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
+
+ // Shared expert
+ if (n_expert_shared > 0) {
+ const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
+ }
+ } else {
+ // Dense layers (first k layers) - GLM uses separate gate/up projections
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
+ }
+
+ // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+ layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
+ layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
+ layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags);
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags);
+ }
+ }
+ }
+ break;
case LLM_ARCH_NEMOTRON:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
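Which layers receive TENSOR_SKIP above follows directly from the loop condition: only the last nextn_predict_layers indices. A small self-contained sketch using GLM-4.5-Air's shape from the hparams hunk earlier (47 layers, 1 NextN layer), so only layer 46 is loaded-but-skipped:

#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_layer = 47, nextn_predict_layers = 1;
    for (uint32_t i = 0; i < n_layer; ++i) {
        const bool skip = nextn_predict_layers > 0 && i >= n_layer - nextn_predict_layers;
        if (skip) {
            std::printf("layer %u: NextN/MTP tensors created with TENSOR_SKIP, excluded from the forward pass\n", i);
        }
    }
    return 0;
}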
@@ -5103,6 +5365,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
}
} break;
+ case LLM_ARCH_HUNYUAN_DENSE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
case LLM_ARCH_SMOLLM3:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
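Note that attn_q_norm/attn_k_norm above are sized {n_embd_head_k}: one weight vector per head dimension, meaning Q and K are RMS-normalized per attention head. A scalar sketch of the normalization itself (standard C++ only; the multiply by the learned weight that build_norm performs is reduced to a comment):

#include <cmath>
#include <cstdio>

int main() {
    float head[4] = {1.0f, -2.0f, 3.0f, 0.5f}; // one head's query (or key) vector
    const float eps = 1e-6f;

    float ss = 0.0f;
    for (float v : head) ss += v * v;
    const float scale = 1.0f / std::sqrt(ss / 4 + eps); // rms over the head dimension

    for (float & v : head) v *= scale; // the real graph then multiplies by attn_q_norm/attn_k_norm weights
    std::printf("normed[0] = %.3f\n", head[0]);
    return 0;
}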
@@ -5132,6 +5427,46 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
}
} break;
+ case LLM_ARCH_OPENAI_MOE:
+ {
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
+
+ layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ // bias
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_head * n_rot}, 0);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_head_kv * n_rot}, 0);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_head_kv * n_rot}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
+ layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), { n_embd, n_expert}, 0);
+ layer.ffn_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
+ }
+ } break;
case LLM_ARCH_LFM2:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
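The per-layer attn_sinks tensor ({n_head}, one scalar per head) is new here. Assuming the gpt-oss-style mechanism, each head owns a learned sink logit that enters the softmax denominator, so the head can park probability mass on "nothing" instead of being forced to distribute it over the sequence. A hedged numeric sketch (plain C++, illustrative values):

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> scores = {2.0f, 0.5f, -1.0f}; // q.k scores of one head for one query row
    const float sink = 1.0f;                          // learned per-head sink logit (assumption)

    float denom = std::exp(sink); // the sink only contributes to the denominator
    for (float s : scores) denom += std::exp(s);

    for (size_t i = 0; i < scores.size(); ++i)
        std::printf("p[%zu] = %.3f\n", i, std::exp(scores[i]) / denom);
    std::printf("mass absorbed by sink = %.3f\n", std::exp(sink) / denom);
    return 0;
}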
@@ -5165,6 +5500,42 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
} break;
+ case LLM_ARCH_SMALLTHINKER:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+
+ GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
+ GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");
+
+ // MoE branch
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+ }
+ } break;
default:
throw std::runtime_error("unknown architecture");
}
@@ -5468,7 +5839,7 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
}

- if (arch == LLM_ARCH_QWEN3MOE) {
+ if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
}

@@ -5490,6 +5861,11 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
}

+ if (arch == LLM_ARCH_SMALLTHINKER) {
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+ }
+
vocab.print_info();
}

@@ -7978,8 +8354,10 @@ struct llm_build_dream : public llm_graph_context {
}
};

- struct llm_build_qwen2vl : public llm_graph_context {
- llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ struct llm_build_llada : public llm_graph_context {
+ llm_build_llada(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ // LLaDA is similar to LLaMA but uses non-causal attention for diffusion
const int64_t n_embd_head = hparams.n_embd_head_v;

GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

@@ -7993,10 +8371,8 @@ struct llm_build_qwen2vl : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- int sections[4];
- std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+ // Non-causal attention for diffusion
+ auto * inp_attn = build_attn_inp_no_cache();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -8004,34 +8380,134 @@ struct llm_build_qwen2vl : public llm_graph_context {
ggml_tensor * inpSA = inpL;

// norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);

// self-attention
{
- // compute Q
+ // compute separate Q, K, V projections without bias, matching LLaDALlamaBlock
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

- Qcur =
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr,
+ 1.0f / sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
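build_attn_inp_no_cache() above gives LLaDA an attention input with no causal constraint, which is what a diffusion denoiser needs: every token conditions on the whole noisy sequence rather than only its prefix. A tiny sketch printing the causal mask next to the all-visible one (0 = may attend, -inf = masked):

#include <cstdio>

int main() {
    const int n = 4; // tokens in the batch
    for (int causal = 1; causal >= 0; --causal) {
        std::printf("%s:\n", causal ? "causal (autoregressive)" : "non-causal (diffusion)");
        for (int i = 0; i < n; ++i) {           // query position
            for (int j = 0; j < n; ++j)         // key position
                std::printf(" %4s", (causal && j > i) ? "-inf" : "0");
            std::printf("\n");
        }
    }
    return 0;
}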
+
+ struct llm_build_qwen2vl : public llm_graph_context {
+ llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv_unified();
+
+ int sections[4];
+ std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_multi(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
);

Kcur = ggml_rope_multi(
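ggml_rope_multi with a sections[4] argument is the multi-axis RoPE variant (M-RoPE): the rotary dimensions are partitioned into up to four sections, used by Qwen2-VL for temporal/height/width/extra position components, and each section is rotated by the corresponding component of the multi-dimensional position. A small sketch of the partitioning only; the split values below are made up for illustration, the real ones come from hparams.rope_sections:

#include <cstdio>

int main() {
    const int    sections[4] = {2, 3, 3, 0};           // per-section rotary-dim counts (example values)
    const char * names[4]    = {"t", "h", "w", "e"};   // temporal / height / width / extra

    int dim = 0;
    for (int s = 0; s < 4; ++s)
        for (int k = 0; k < sections[s]; ++k, ++dim)
            std::printf("rotary dim %d -> rotated by position component '%s'\n", dim, names[s]);
    return 0;
}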
| 13761 |
}
|
| 13762 |
};
|
| 13763 |
|
| 13764 |
+
struct llm_build_glm4_moe : public llm_graph_context {
|
| 13765 |
+
llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
| 13766 |
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        // Only process up to last layer (skip final NextN layer)
+        // Final layer tensors are loaded but not processed in forward pass
+        const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
+        for (int il = 0; il < n_transformer_layers; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // Pre-attention norm
+            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                }
+                cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                }
+                cb(Kcur, "Kcur", il);
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                }
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                // Apply Q/K norm if available (GLM-4.5 355B variant)
+                if (model.layers[il].attn_q_norm) {
+                    Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                    cb(Qcur, "Qcur_normed", il);
+                }
+                if (model.layers[il].attn_k_norm) {
+                    Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+                    cb(Kcur, "Kcur_normed", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_transformer_layers - 1 && inp_out_ids) {
+                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // Post-attention norm
+            cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "post_attn_norm", il);
+
+            // Check if this is a dense layer (n_layer_dense_lead=1, so layer 0 is dense)
+            if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
+                // Dense FFN layer
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                // Process routed experts using existing MoE infrastructure
+                ggml_tensor * routed_out = build_moe_ffn(cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        model.layers[il].ffn_exp_probs_b,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, hparams.expert_weights_norm,
+                        true, hparams.expert_weights_scale,
+                        (llama_expert_gating_func_type) hparams.expert_gating_func,
+                        il);
+                cb(routed_out, "ffn_moe_out", il);
+
+                // Process shared expert on original input
+                ggml_tensor * shared_out = build_ffn(cur,
+                        model.layers[il].ffn_up_shexp,   NULL, NULL,
+                        model.layers[il].ffn_gate_shexp, NULL, NULL,
+                        model.layers[il].ffn_down_shexp, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(shared_out, "ffn_shexp_out", il);
+
+                // Final output: routed_output + shared_output
+                cur = ggml_add(ctx0, routed_out, shared_out);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+        cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
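Two details of the GLM-4.5 graph builder above are easy to miss: the layer loop stops `nextn_predict_layers` short of `n_layer`, and the FFN path switches from dense to routed-plus-shared experts at `n_layer_dense_lead`. A standalone sketch of both rules follows; it uses hypothetical helper names and is not part of the llama.cpp API.

#include <cstdint>

// Sketch of the layer bookkeeping in llm_build_glm4_moe above (names are illustrative).
enum class ffn_path { dense, moe_plus_shared };

// layers [0, n_layer_dense_lead) run a plain SwiGLU FFN; all later layers
// add routed experts on top of the always-on shared expert
ffn_path glm4_moe_ffn_path(uint32_t il, uint32_t n_layer_dense_lead) {
    return il < n_layer_dense_lead ? ffn_path::dense : ffn_path::moe_plus_shared;
}

// the trailing NextN/MTP layer(s) are loaded but skipped in the forward pass
int transformer_layers(int n_layer, int nextn_predict_layers) {
    return n_layer - nextn_predict_layers;
}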
 struct llm_build_nemotron : public llm_graph_context {
     llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
...
     }
 };
 
+struct llm_build_hunyuan_dense : public llm_graph_context {
+    llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
 
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
...
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
             // norm
             cur = build_norm(inpL,
                     model.layers[il].attn_norm, NULL,
                     LLM_NORM_RMS, il);
             cb(cur, "attn_norm", il);
 
             // self-attention
             {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
...
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                Kcur = build_norm(Kcur,
+                        model.layers[il].attn_k_norm, nullptr,
+                        LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_norm", il);
+
+                Qcur = build_norm(Qcur,
+                        model.layers[il].attn_q_norm, nullptr,
+                        LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_norm", il);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+            // feed-forward network (non-MoE)
+            ggml_tensor * cur_mlp = build_ffn(cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur_mlp, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur_mlp, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
+struct llm_build_smollm3 : public llm_graph_context {
+    llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                if (use_rope) {
+                    Qcur = ggml_rope_ext(
+                            ctx0, Qcur, inp_pos, nullptr,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                             ext_factor, attn_factor, beta_fast, beta_slow
                     );
 
...
     }
 };
 
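The `use_rope` rule in the smollm3 builder above implements NoPE layers: every `n_no_rope_layer_step`-th layer skips rotary position embedding entirely. A quick self-contained illustration (the step value of 4 is hypothetical, chosen only for the demo):

#include <cstdio>

int main() {
    const int n_layer = 12;
    const int n_no_rope_layer_step = 4; // hypothetical value for illustration
    for (int il = 0; il < n_layer; ++il) {
        // same predicate as the builder above: layers 3, 7, 11 skip RoPE
        const bool use_rope = (il + 1) % n_no_rope_layer_step != 0;
        std::printf("layer %2d: %s\n", il, use_rope ? "rope" : "nope");
    }
    return 0;
}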
+struct llm_build_openai_moe_iswa : public llm_graph_context {
+    llm_build_openai_moe_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, nullptr,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn_with_sinks(inp_attn,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].attn_sinks, 1.0f/sqrtf(float(n_rot)), il);
+
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = ffn_inp;
+            cur = build_norm(cur,
+                    model.layers[il].attn_post_norm, nullptr,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_post_norm", il);
+
+            // MoE branch
+            cur = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,  model.layers[il].ffn_gate_inp_b,
+                    model.layers[il].ffn_up_exps,   model.layers[il].ffn_up_exps_b,
+                    model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b,
+                    model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SWIGLU_OAI_MOE, false,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
+                    il);
+            cb(cur, "ffn_moe_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
|
| 17741 |
const llama_model & model;
|
| 17742 |
|
|
|
|
| 17914 |
}
|
| 17915 |
};
|
| 17916 |
|
| 17917 |
+
template <bool iswa>
|
| 17918 |
+
struct llm_build_smallthinker : public llm_graph_context{
|
| 17919 |
+
llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
|
| 17920 |
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
| 17921 |
+
|
| 17922 |
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
| 17923 |
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
| 17924 |
+
|
| 17925 |
+
ggml_tensor * cur;
|
| 17926 |
+
ggml_tensor * inpL;
|
| 17927 |
+
|
| 17928 |
+
inpL = build_inp_embd(model.tok_embd);
|
| 17929 |
+
|
| 17930 |
+
// inp_pos - contains the positions
|
| 17931 |
+
ggml_tensor * inp_pos = build_inp_pos();
|
| 17932 |
+
|
| 17933 |
+
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
|
| 17934 |
+
inp_attn_type * inp_attn = nullptr;
|
| 17935 |
+
|
| 17936 |
+
if constexpr (iswa) {
|
| 17937 |
+
inp_attn = build_attn_inp_kv_unified_iswa();
|
| 17938 |
+
} else {
|
| 17939 |
+
inp_attn = build_attn_inp_kv_unified();
|
| 17940 |
+
}
|
| 17941 |
+
|
| 17942 |
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
| 17943 |
+
|
| 17944 |
+
for (int il = 0; il < n_layer; ++il) {
|
| 17945 |
+
ggml_tensor * inpSA = inpL;
|
| 17946 |
+
ggml_tensor * probs = nullptr;
|
| 17947 |
+
|
| 17948 |
+
probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
|
| 17949 |
+
cb(probs, "ffn_moe_logits", il);
|
| 17950 |
+
|
| 17951 |
+
// norm
|
| 17952 |
+
cur = build_norm(inpL,model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
| 17953 |
+
cb(cur, "attn_norm", il);
|
| 17954 |
+
|
| 17955 |
+
// self_attention
|
| 17956 |
+
{
|
| 17957 |
+
// compute Q and K and RoPE them
|
| 17958 |
+
struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
| 17959 |
+
cb(Qcur, "Qcur", il);
|
| 17960 |
+
|
| 17961 |
+
struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
| 17962 |
+
cb(Kcur, "Kcur", il);
|
| 17963 |
+
|
| 17964 |
+
struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
| 17965 |
+
cb(Vcur, "Vcur", il);
|
| 17966 |
+
|
| 17967 |
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
| 17968 |
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
| 17969 |
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
| 17970 |
+
|
| 17971 |
+
if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) {
|
| 17972 |
+
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
| 17973 |
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
| 17974 |
+
|
| 17975 |
+
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
| 17976 |
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
| 17977 |
+
}
|
| 17978 |
+
|
| 17979 |
+
cb(Qcur, "Qcur", il);
|
| 17980 |
+
cb(Kcur, "Kcur", il);
|
| 17981 |
+
|
| 17982 |
+
cur = build_attn(inp_attn,
|
| 17983 |
+
model.layers[il].wo, model.layers[il].bo,
|
| 17984 |
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
| 17985 |
+
}
|
| 17986 |
+
|
| 17987 |
+
if (il == n_layer - 1 && inp_out_ids) {
|
| 17988 |
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
| 17989 |
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
| 17990 |
+
probs = ggml_get_rows(ctx0, probs, inp_out_ids);
|
| 17991 |
+
}
|
| 17992 |
+
|
| 17993 |
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
| 17994 |
+
cb(ffn_inp, "ffn_inp", il);
|
| 17995 |
+
|
| 17996 |
+
// MoE branch
|
| 17997 |
+
cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
|
| 17998 |
+
cb(cur, "ffn_norm", il);
|
| 17999 |
+
|
| 18000 |
+
ggml_tensor * ffn_out =
|
| 18001 |
+
build_moe_ffn(cur,
|
| 18002 |
+
nullptr,
|
| 18003 |
+
model.layers[il].ffn_up_exps,
|
| 18004 |
+
model.layers[il].ffn_gate_exps,
|
| 18005 |
+
model.layers[il].ffn_down_exps,
|
| 18006 |
+
nullptr,
|
| 18007 |
+
n_expert, n_expert_used,
|
| 18008 |
+
LLM_FFN_RELU, true,
|
| 18009 |
+
false, 0.0,
|
| 18010 |
+
static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
|
| 18011 |
+
il, probs);
|
| 18012 |
+
|
| 18013 |
+
cb(ffn_out, "ffn_out", il);
|
| 18014 |
+
cur = ffn_out;
|
| 18015 |
+
|
| 18016 |
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
| 18017 |
+
cur = build_cvec(cur, il);
|
| 18018 |
+
cb(cur, "l_out", il);
|
| 18019 |
+
|
| 18020 |
+
// input for next layer
|
| 18021 |
+
inpL = cur;
|
| 18022 |
+
}
|
| 18023 |
+
|
| 18024 |
+
cur = inpL;
|
| 18025 |
+
|
| 18026 |
+
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
|
| 18027 |
+
cb(cur, "result_norm", -1);
|
| 18028 |
+
|
| 18029 |
+
// lm_head
|
| 18030 |
+
cur = build_lora_mm(model.output, cur);
|
| 18031 |
+
cb(cur, "result_output", -1);
|
| 18032 |
+
res->t_logits = cur;
|
| 18033 |
+
|
| 18034 |
+
ggml_build_forward_expand(gf, cur);
|
| 18035 |
+
}
|
| 18036 |
+
};
|
| 18037 |
+
|
| 18038 |
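The `template <bool iswa>` builder above resolves the attention-input type at compile time via `std::conditional_t`, instead of branching on every use. A reduced sketch of the same idiom with placeholder types (the real input types live in llama-graph.h):

#include <type_traits>

struct inp_full {};  // stand-in for llm_graph_input_attn_kv_unified
struct inp_iswa {};  // stand-in for llm_graph_input_attn_kv_unified_iswa

template <bool iswa>
struct builder {
    // resolved once per instantiation; no runtime dispatch afterwards
    using inp_t = std::conditional_t<iswa, inp_iswa, inp_full>;
    inp_t * inp = nullptr;
};

int main() {
    const bool swa = true;  // would come from hparams.swa_type, as below
    if (swa) {
        builder<true>  b; (void) b;
    } else {
        builder<false> b; (void) b;
    }
    return 0;
}

The runtime flag picks the instantiation exactly once, which is the pattern the `LLM_ARCH_SMALLTHINKER` case in `create_memory`'s companion switch uses further down.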
 llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
     llama_memory_i * res;
 
...
         case LLM_ARCH_NEO_BERT:
         case LLM_ARCH_WAVTOKENIZER_DEC:
         case LLM_ARCH_DREAM:
+        case LLM_ARCH_LLADA:
             {
                 res = nullptr;
             } break;
...
                 /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
                 /* n_seq_max         */ cparams.n_seq_max,
                 /* offload           */ cparams.offload_kqv,
+                /* unified           */ cparams.kv_unified,
                 /* filter_attn       */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
                 /* filter_recr       */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
         } else {
...
                 llm = std::make_unique<llm_build_dream>(*this, params);
             }
             break;
+        case LLM_ARCH_LLADA:
+            {
+                llm = std::make_unique<llm_build_llada>(*this, params);
+            }
+            break;
         case LLM_ARCH_QWEN2VL:
             {
                 llm = std::make_unique<llm_build_qwen2vl>(*this, params);
...
             {
                 llm = std::make_unique<llm_build_glm4>(*this, params);
             } break;
+        case LLM_ARCH_GLM4_MOE:
+            {
+                llm = std::make_unique<llm_build_glm4_moe>(*this, params);
+            } break;
         case LLM_ARCH_BITNET:
             {
                 llm = std::make_unique<llm_build_bitnet>(*this, params);
...
             {
                 llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
             } break;
+        case LLM_ARCH_HUNYUAN_DENSE:
+            {
+                llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
+            } break;
         case LLM_ARCH_SMOLLM3:
             {
                 llm = std::make_unique<llm_build_smollm3>(*this, params);
             } break;
+        case LLM_ARCH_OPENAI_MOE:
+            {
+                llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
+            } break;
         case LLM_ARCH_FALCON_H1:
             {
                 llm = std::make_unique<llm_build_falcon_h1>(*this, params);
...
             {
                 llm = std::make_unique<llm_build_lfm2>(*this, params);
             } break;
+        case LLM_ARCH_SMALLTHINKER:
+            {
+                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+                    llm = std::make_unique<llm_build_smallthinker<true>> (*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
+                }
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
...
         /*.use_mmap        =*/ true,
         /*.use_mlock       =*/ false,
         /*.check_tensors   =*/ false,
+        /*.use_extra_bufts =*/ true,
     };
 
 #ifdef GGML_USE_METAL
...
         // use what we call a normal RoPE, operating on pairs of consecutive head values
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_LLADA:
         case LLM_ARCH_LLAMA4:
         case LLM_ARCH_DECI:
         case LLM_ARCH_BAICHUAN:
...
         case LLM_ARCH_MINICPM3:
         case LLM_ARCH_DOTS1:
         case LLM_ARCH_HUNYUAN_MOE:
+        case LLM_ARCH_OPENAI_MOE:
+        case LLM_ARCH_HUNYUAN_DENSE:
         case LLM_ARCH_LFM2:
+        case LLM_ARCH_SMALLTHINKER:
+        case LLM_ARCH_GLM4_MOE:
             return LLAMA_ROPE_TYPE_NEOX;
 
         case LLM_ARCH_QWEN2VL:
...
     return llm_arch_is_recurrent(model->arch);
 }
 
+bool llama_model_is_diffusion(const llama_model * model) {
+    return llm_arch_is_diffusion(model->arch);
+}
+
 const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
     return model->tensors_by_name;
 }
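The new `llama_model_is_diffusion` accessor mirrors the existing `llama_model_is_recurrent`. A usage sketch, assuming `model` is a handle obtained from the usual model-loading API:

#include "llama.h"

// Sketch: pick a generation strategy from the new capability query.
static const char * decode_strategy(const llama_model * model) {
    if (llama_model_is_diffusion(model)) {
        return "diffusion";      // LLaDA / Dream style iterative denoising
    }
    if (llama_model_is_recurrent(model)) {
        return "recurrent";      // Mamba / RWKV style state handling
    }
    return "autoregressive";     // standard token-by-token decoding
}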
examples/talk-llama/llama-model.h
CHANGED

@@ -39,6 +39,7 @@ enum llm_type {
     LLM_TYPE_410M,
     LLM_TYPE_450M,
     LLM_TYPE_475M,
+    LLM_TYPE_537M,
     LLM_TYPE_700M,
     LLM_TYPE_770M,
     LLM_TYPE_780M,

@@ -101,8 +102,10 @@ enum llm_type {
     LLM_TYPE_A13B,
     LLM_TYPE_21B_A3B, // Ernie MoE small
     LLM_TYPE_30B_A3B,
+    LLM_TYPE_106B_A12B, // GLM-4.5-Air
     LLM_TYPE_235B_A22B,
     LLM_TYPE_300B_A47B, // Ernie MoE big
+    LLM_TYPE_355B_A32B, // GLM-4.5
     LLM_TYPE_E2B,
     LLM_TYPE_E4B,
 };

@@ -166,6 +169,15 @@ struct llama_layer_shortconv {
     struct ggml_tensor * out_proj = nullptr;
 };

+struct llama_layer_nextn {
+    struct ggml_tensor * eh_proj          = nullptr;
+    struct ggml_tensor * embed_tokens     = nullptr;
+    struct ggml_tensor * enorm            = nullptr;
+    struct ggml_tensor * hnorm            = nullptr;
+    struct ggml_tensor * shared_head_head = nullptr;
+    struct ggml_tensor * shared_head_norm = nullptr;
+};
+
 struct llama_layer {
     // normalization
     struct ggml_tensor * attn_norm = nullptr;

@@ -241,10 +253,14 @@ struct llama_layer {
     struct ggml_tensor * ffn_up_enc = nullptr;

     // ff MoE
-    struct ggml_tensor * ffn_gate_inp  = nullptr;
-    struct ggml_tensor * ffn_gate_exps = nullptr;
-    struct ggml_tensor * ffn_down_exps = nullptr;
-    struct ggml_tensor * ffn_up_exps   = nullptr;
+    struct ggml_tensor * ffn_gate_inp    = nullptr;
+    struct ggml_tensor * ffn_gate_exps   = nullptr;
+    struct ggml_tensor * ffn_down_exps   = nullptr;
+    struct ggml_tensor * ffn_up_exps     = nullptr;
+    struct ggml_tensor * ffn_gate_inp_b  = nullptr;
+    struct ggml_tensor * ffn_gate_exps_b = nullptr;
+    struct ggml_tensor * ffn_down_exps_b = nullptr;
+    struct ggml_tensor * ffn_up_exps_b   = nullptr;

     // ff shared expert (shexp)
     struct ggml_tensor * ffn_gate_inp_shexp = nullptr;

@@ -349,11 +365,16 @@ struct llama_layer {
     struct ggml_tensor * laurel_r = nullptr;
     struct ggml_tensor * laurel_post_norm = nullptr;

+    // openai-moe
+    struct ggml_tensor * attn_sinks = nullptr;
+
     struct llama_layer_posnet posnet;

     struct llama_layer_convnext convnext;

     struct llama_layer_shortconv shortconv;
+
+    struct llama_layer_nextn nextn;
 };

 struct llama_model {
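The `= nullptr` defaults on the layer tensors above are what make tensors optional throughout this sync: loaders leave absent tensors null and the graph builders branch on presence, as in the `if (model.layers[il].bq)` checks earlier in this diff. A reduced sketch of that pattern with a stand-in tensor type (not the ggml struct):

#include <cstdio>

struct tensor_stub { const char * name; };

// Sketch: members default to nullptr, and construction code branches on
// presence rather than consulting per-model feature flags.
struct layer_sketch {
    tensor_stub * wq = nullptr; // always loaded
    tensor_stub * bq = nullptr; // optional bias, may stay null
};

int main() {
    layer_sketch l;
    tensor_stub wq{"blk.0.attn_q.weight"};
    l.wq = &wq; // bias intentionally left unset
    std::printf("has bias: %s\n", l.bq ? "yes" : "no");
    return 0;
}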
examples/talk-llama/llama-quant.cpp
CHANGED

@@ -211,7 +211,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
     const int64_t nx = tensor->ne[0];
     const int64_t qk_k = ggml_blck_size(new_type);

-    if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
+    if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
+        new_type = GGML_TYPE_Q8_0;
+    }
+    else if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
         new_type = GGML_TYPE_Q8_0;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||

@@ -223,6 +226,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             new_type = GGML_TYPE_Q6_K;
         }
     }
+    } else if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
+        // MoE tensors -> MXFP4
+        // other tensors -> Q8_0
+        if (tensor->ne[2] > 1) {
+            new_type = GGML_TYPE_MXFP4;
+        } else {
+            new_type = GGML_TYPE_Q8_0;
+        }
     } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
         if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
             new_type = qs.params->token_embedding_type;

@@ -533,6 +544,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
     case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;

+    case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: default_type = GGML_TYPE_MXFP4; break;
+
     // K-quants
     case LLAMA_FTYPE_MOSTLY_Q2_K_S:
     case LLAMA_FTYPE_MOSTLY_Q2_K: default_type = GGML_TYPE_Q2_K; break;

@@ -875,9 +888,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::

     // get more optimal quantization type based on the tensor shape, layer, etc.
     if (!params->pure && ggml_is_quantized(default_type)) {
+        int fallback = qs.n_fallback;
         new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
-        // unless the user specifies a type
-        if (params->tensor_types) {
+        // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
+        if (params->tensor_types && qs.n_fallback - fallback == 0) {
            const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
            const std::string tensor_name(tensor->name);
            for (const auto & [tname, qtype] : tensor_types) {

@@ -890,7 +904,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             }
         }
     }
-
     if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
         new_type = params->token_embedding_type;
     }

@@ -984,6 +997,29 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                 const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;

                 new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+
+                // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
+#if 0
+                if (new_type == GGML_TYPE_MXFP4) {
+                    auto * x = f32_data_03;
+
+                    //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
+                    std::vector<float> deq(nrows*n_per_row);
+                    const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
+                    qtype->to_float(new_data_03, deq.data(), deq.size());
+
+                    double err = 0.0f;
+                    for (int i = 0; i < (int) deq.size(); ++i) {
+                        err += fabsf(deq[i] - x[i]);
+                        //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
+                        if (deq[i] != x[i]) {
+                            LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
+                        }
+                    }
+                    //LLAMA_LOG_INFO("err = %f\n", err);
+                    GGML_ASSERT(err == 0.00000);
+                }
+#endif
             }
             LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
         }
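The MXFP4_MOE routing above keys off tensor rank: stacked per-expert weights are 3-D (`ne[2] > 1`, e.g. a hypothetical shape [n_embd, n_ff, n_expert] for `ffn_up_exps`), so they receive MXFP4 while everything else falls back to Q8_0. A sketch of the same predicate on a plain shape array:

#include <cstdint>

// Sketch of the MXFP4_MOE rule above; the shape array stands in for
// ggml's tensor->ne and the returned strings for the ggml types.
const char * mxfp4_moe_type(const int64_t ne[4]) {
    return ne[2] > 1 ? "MXFP4" : "Q8_0";
}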
examples/talk-llama/llama-vocab.cpp
CHANGED

@@ -307,6 +307,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             };
             break;
         case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
+        case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
             regex_exprs = {
                 "\\p{N}{1,3}",
                 "[一-龥぀-ゟ゠-ヿ]+",

@@ -1855,7 +1856,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "gigachat"   ||
                 tokenizer_pre == "jina-v2-es" ||
                 tokenizer_pre == "jina-v2-de" ||
-                tokenizer_pre == "a.x-4.0") {
+                tokenizer_pre == "a.x-4.0"    ||
+                tokenizer_pre == "mellum") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "jina-v1-en" ||

@@ -1964,6 +1966,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "hunyuan") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
             clean_spaces = false;
+        } else if (
+                tokenizer_pre == "hunyuan-dense") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
+            clean_spaces = false;
         } else if (
                 tokenizer_pre == "kimi-k2") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;

@@ -2185,6 +2191,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|fim▁begin|>" // DeepSeek
                     || t.first == "<PRE>"
                     || t.first == "▁<PRE>" // CodeLlama
+                    || t.first == "<|code_prefix|>" // GLM-4.5
                     ) {
                 special_fim_pre_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {

@@ -2204,6 +2211,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|fim▁hole|>" // DeepSeek
                     || t.first == "<SUF>"
                     || t.first == "▁<SUF>" // CodeLlama
+                    || t.first == "<|code_suffix|>" // GLM-4.5
                     ) {
                 special_fim_suf_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {

@@ -2223,6 +2231,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|fim▁end|>" // DeepSeek
                     || t.first == "<MID>"
                     || t.first == "▁<MID>" // CodeLlama
+                    || t.first == "<|code_middle|>" // GLM-4.5
                     ) {
                 special_fim_mid_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {

@@ -2305,6 +2314,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|eot_id|>"
                     || t.first == "<|im_end|>"
                     || t.first == "<|end|>"
+                    || t.first == "<|return|>" // o200k_harmony
+                    || t.first == "<|call|>"   // o200k_harmony
                     || t.first == "<end_of_turn>"
                     || t.first == "<|endoftext|>"
                     || t.first == "<|eom_id|>"

@@ -2328,6 +2339,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             }
         }

+        // @ngxson : quick hack for gpt-oss, always render these tokens
+        for (const auto & t : token_to_id) {
+            if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
+                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+            }
+        }
+
         // sanity checks
         if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
             special_eog_ids.insert(special_eos_id);

@@ -2343,6 +2361,37 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             special_eog_ids.insert(special_eom_id);
             LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
+
+        // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
+        // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
+        // we remove the "<|end|>" token from the EOG list
+        {
+            bool has_return = false;
+            bool has_call   = false;
+            bool has_end    = false;
+
+            llama_token end_id = LLAMA_TOKEN_NULL;
+
+            LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
+            for (auto tid : special_eog_ids) {
+                LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
+
+                if (id_to_token[tid].text == "<|return|>") {
+                    has_return = true;
+                } else if (id_to_token[tid].text == "<|call|>") {
+                    has_call = true;
+                } else if (id_to_token[tid].text == "<|end|>") {
+                    has_end = true;
+                    end_id = tid;
+                }
+            }
+
+            if (has_return && has_call && has_end) {
+                special_eog_ids.erase(end_id);
+                id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+                LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
+            }
+        }
     }

     // build special tokens cache
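Condensed, the o200k_harmony workaround above is a set rule: when a vocab marks `<|return|>`, `<|call|>`, and `<|end|>` all as end-of-generation, `<|end|>` is demoted so that harmony-format channels terminated by `<|end|>` keep streaming. A standalone sketch over token strings instead of token ids:

#include <set>
#include <string>

// Sketch of the EOG demotion rule above (strings stand in for token ids).
void demote_end_token(std::set<std::string> & eog) {
    if (eog.count("<|return|>") && eog.count("<|call|>") && eog.count("<|end|>")) {
        eog.erase("<|end|>");
    }
}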
examples/talk-llama/llama-vocab.h
CHANGED

@@ -46,6 +46,7 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
     LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
     LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
+    LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
 };

 struct LLM_KV;
examples/talk-llama/llama.h
CHANGED

@@ -152,6 +152,7 @@ extern "C" {
     //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
     LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors

     LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 };

@@ -284,10 +285,11 @@ extern "C" {
     const struct llama_model_kv_override * kv_overrides;

     // Keep the booleans together to avoid misalignment during copy-by-value.
-    bool vocab_only;
-    bool use_mmap;
-    bool use_mlock;
-    bool check_tensors;
+    bool vocab_only;      // only load the vocabulary, no weights
+    bool use_mmap;        // use mmap if possible
+    bool use_mlock;       // force system to keep model in RAM
+    bool check_tensors;   // validate model tensor data
+    bool use_extra_bufts; // use extra buffer types (used for weight repacking)
 };

 // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations

@@ -537,6 +539,9 @@ extern "C" {
     // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
     LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);

+    // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
+    LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
+
     // Returns 0 on success
     LLAMA_API uint32_t llama_model_quantize(
             const char * fname_inp,

@@ -865,6 +870,29 @@ extern "C" {
             size_t n_token_capacity,
             size_t * n_token_count_out);

+#define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
+
+    typedef uint32_t llama_state_seq_flags;
+
+    LLAMA_API size_t llama_state_seq_get_size_ext(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags);
+
+    LLAMA_API size_t llama_state_seq_get_data_ext(
+            struct llama_context * ctx,
+            uint8_t * dst,
+            size_t size,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags);
+
+    LLAMA_API size_t llama_state_seq_set_data_ext(
+            struct llama_context * ctx,
+            const uint8_t * src,
+            size_t size,
+            llama_seq_id dest_seq_id,
+            llama_state_seq_flags flags);
+
     //
     // Decoding
     //
@@ -1432,6 +1460,8 @@ extern "C" {

     ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
     void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
+
+    enum ggml_opt_optimizer_type optimizer_type;
 };

 LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
|