Commit 4321600 · ggerganov committed (1 parent: a0af6fc)

talk-llama : sync llama.cpp
examples/talk-llama/llama-arch.cpp CHANGED
@@ -62,6 +62,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DEEPSEEK2, "deepseek2" },
     { LLM_ARCH_CHATGLM, "chatglm" },
     { LLM_ARCH_GLM4, "glm4" },
+    { LLM_ARCH_GLM4_MOE, "glm4moe" },
     { LLM_ARCH_BITNET, "bitnet" },
     { LLM_ARCH_T5, "t5" },
     { LLM_ARCH_T5ENCODER, "t5encoder" },
@@ -85,9 +86,13 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_ERNIE4_5, "ernie4_5" },
     { LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
     { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
+    { LLM_ARCH_HUNYUAN_DENSE, "hunyuan-dense" },
     { LLM_ARCH_SMOLLM3, "smollm3" },
+    { LLM_ARCH_OPENAI_MOE, "gpt-oss" },
     { LLM_ARCH_LFM2, "lfm2" },
     { LLM_ARCH_DREAM, "dream" },
+    { LLM_ARCH_SMALLTHINKER, "smallthinker" },
+    { LLM_ARCH_LLADA, "llada" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -124,6 +129,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
     { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
     { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
+    { LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
     { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -1388,6 +1394,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
         },
     },
+    {
+        LLM_ARCH_GLM4_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+            // NextN/MTP tensors - preserved but unused (in final layer, dynamic layer number)
+            { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" },
+            { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" },
+            { LLM_TENSOR_NEXTN_ENORM, "blk.%d.nextn.enorm" },
+            { LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
+        },
+    },
     {
         LLM_ARCH_BITNET,
         {
@@ -1895,6 +1935,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_HUNYUAN_DENSE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+
+        },
+    },
     {
         LLM_ARCH_SMOLLM3,
         {
@@ -1912,6 +1972,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_OPENAI_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_SINKS, "blk.%d.attn_sinks" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_LFM2,
         {
@@ -1933,6 +2012,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
         }
     },
+    {
+        LLM_ARCH_SMALLTHINKER,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }
+        },
+    },
     {
         LLM_ARCH_DREAM,
         {
@@ -1950,6 +2050,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
        },
    },
+    {
+        LLM_ARCH_LLADA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1989,6 +2106,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_SINKS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
     {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -2120,6 +2238,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
     {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    // NextN/MTP tensors are currently ignored (reserved for future MTP support)
+    // These tensors only exist in the last layer(s) and are treated as output tensors
+    {LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_EMBED_TOKENS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_ENORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
 };
 
 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
@@ -2202,6 +2328,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
 bool llm_arch_is_diffusion(const llm_arch & arch) {
     switch (arch) {
         case LLM_ARCH_DREAM:
+        case LLM_ARCH_LLADA:
             return true;
         default:
             return false;
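
Note: the new GLM4_MOE and gpt-oss entries above reuse the usual "blk.%d.*" naming scheme. Below is a minimal sketch of how such a pattern expands into a concrete GGUF tensor name; the tensor_name helper and the layer indices are illustrative only, not part of llama.cpp.

    #include <cstdio>
    #include <string>

    // Illustration only: the per-layer patterns in LLM_TENSOR_NAMES contain a "%d"
    // placeholder that is filled with the block index to obtain the tensor name.
    static std::string tensor_name(const char * pattern, int il) {
        char buf[256];
        std::snprintf(buf, sizeof(buf), pattern, il);
        return buf;
    }

    int main() {
        std::printf("%s\n", tensor_name("blk.%d.attn_sinks", 0).c_str());     // blk.0.attn_sinks
        std::printf("%s\n", tensor_name("blk.%d.nextn.eh_proj", 12).c_str()); // blk.12.nextn.eh_proj
        return 0;
    }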
examples/talk-llama/llama-arch.h CHANGED
@@ -66,6 +66,7 @@ enum llm_arch {
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
     LLM_ARCH_GLM4,
+    LLM_ARCH_GLM4_MOE,
     LLM_ARCH_BITNET,
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
@@ -89,9 +90,13 @@ enum llm_arch {
     LLM_ARCH_ERNIE4_5,
     LLM_ARCH_ERNIE4_5_MOE,
     LLM_ARCH_HUNYUAN_MOE,
+    LLM_ARCH_HUNYUAN_DENSE,
     LLM_ARCH_SMOLLM3,
+    LLM_ARCH_OPENAI_MOE,
     LLM_ARCH_LFM2,
     LLM_ARCH_DREAM,
+    LLM_ARCH_SMALLTHINKER,
+    LLM_ARCH_LLADA,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -128,6 +133,7 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
     LLM_KV_MOE_EVERY_N_LAYERS,
+    LLM_KV_NEXTN_PREDICT_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@@ -260,6 +266,7 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_OUT_NORM,
     LLM_TENSOR_ATTN_POST_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
+    LLM_TENSOR_ATTN_SINKS,
     LLM_TENSOR_FFN_GATE_INP,
     LLM_TENSOR_FFN_GATE_INP_SHEXP,
     LLM_TENSOR_FFN_NORM,
@@ -406,6 +413,12 @@ enum llm_tensor {
     LLM_TENSOR_SHORTCONV_CONV,
     LLM_TENSOR_SHORTCONV_INPROJ,
     LLM_TENSOR_SHORTCONV_OUTPROJ,
+    LLM_TENSOR_NEXTN_EH_PROJ,
+    LLM_TENSOR_NEXTN_EMBED_TOKENS,
+    LLM_TENSOR_NEXTN_ENORM,
+    LLM_TENSOR_NEXTN_HNORM,
+    LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
+    LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
 };
 
 enum llm_tensor_layer {
examples/talk-llama/llama-batch.cpp CHANGED
@@ -59,7 +59,7 @@ bool llama_batch_allocr::init(
     for (int32_t i = 0; i < batch.n_tokens; ++i) {
         for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
             if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= (llama_seq_id) n_seq_max)) {
-                LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
+                LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d >= %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
                 return false;
             }
         }
@@ -477,7 +477,7 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
 
 llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch, bool sequential) {
     if (sequential && has_cpl) {
-        LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch\n", __func__);
+        LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch (you may need to use the -kvu flag)\n", __func__);
 
         return {};
     }
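
Note: a tiny standalone illustration (not llama.cpp code) of the invariant behind the corrected log message above: a sequence id is valid only when 0 <= seq_id < n_seq_max, which is why the message now prints ">=" rather than ">".

    #include <cassert>
    #include <cstdint>

    // Valid sequence ids form the half-open range [0, n_seq_max).
    static bool seq_id_is_valid(int32_t seq_id, int32_t n_seq_max) {
        return seq_id >= 0 && seq_id < n_seq_max;
    }

    int main() {
        assert( seq_id_is_valid(0, 4));
        assert( seq_id_is_valid(3, 4));
        assert(!seq_id_is_valid(4, 4)); // seq_id == n_seq_max is already out of range
        assert(!seq_id_is_valid(-1, 4));
        return 0;
    }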
examples/talk-llama/llama-chat.cpp CHANGED
@@ -66,6 +66,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
     { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
     { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
+    { "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
+    { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
     { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
 };
 
@@ -191,8 +193,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_LLAMA4;
     } else if (tmpl_contains("<|endofuserprompt|>")) {
         return LLM_CHAT_TEMPLATE_DOTS1;
-    } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
+    } else if (tmpl_contains("<|extra_0|>") && tmpl_contains("<|extra_4|>")) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
+    } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
+        return LLM_CHAT_TEMPLATE_OPENAI_MOE;
+    } else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
+        return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
     } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
         return LLM_CHAT_TEMPLATE_KIMI_K2;
     }
@@ -619,8 +625,6 @@ int32_t llm_chat_apply_template(
     } else if (tmpl == LLM_CHAT_TEMPLATE_YANDEX) {
         // Yandex template ("\n\n" is defined as EOT token)
 
-        ss << "<s>";
-
         for (size_t i = 0; i < chat.size(); i++) {
             std::string role(chat[i]->role);
             if (role == "user") {
@@ -698,11 +702,37 @@ int32_t llm_chat_apply_template(
             if (role == "system") {
                 ss << "<|startoftext|>" << message->content << "<|extra_4|>";
             } else if (role == "assistant") {
-                ss << "<|startoftext|>" << message->content << "<|eos|>";
+                ss << message->content << "<|eos|>";
             } else {
                 ss << "<|startoftext|>" << message->content << "<|extra_0|>";
             }
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_OPENAI_MOE) {
+        // OpenAI MoE (based on Harmony chat template)
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|start|>" << role << "<|message|>" << message->content;
+            ss << (role == "assistant" ? "<|return|>" : "<|end|>");
+        }
+        if (add_ass) {
+            ss << "<|start|>assistant";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_DENSE) {
+        // tencent/Hunyuan-4B-Instruct
+        for (size_t i = 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (i == 0) {
+                if (role == "system") {
+                    ss << chat[i]->content << "<|hy_place▁holder▁no▁3|>";
+                }
+            }
+
+            if (role == "assistant") {
+                ss << "<|hy_Assistant|>" << chat[i]->content << "<|hy_place▁holder▁no▁2|>";
+            } else if (role == "user") {
+                ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
+            }
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
         // moonshotai/Kimi-K2-Instruct
         for (auto message : chat) {
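
Note: for reference, a self-contained sketch of what the new gpt-oss (LLM_CHAT_TEMPLATE_OPENAI_MOE) branch above produces. The msg struct stands in for llama_chat_message and the messages are example values; the formatting loop mirrors the code in the diff.

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    struct msg { std::string role, content; };

    int main() {
        std::vector<msg> chat = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!" },
        };
        const bool add_ass = true; // append the assistant prefix for generation

        std::ostringstream ss;
        for (const auto & m : chat) {
            ss << "<|start|>" << m.role << "<|message|>" << m.content;
            ss << (m.role == "assistant" ? "<|return|>" : "<|end|>");
        }
        if (add_ass) {
            ss << "<|start|>assistant";
        }

        // Prints:
        // <|start|>system<|message|>You are a helpful assistant.<|end|><|start|>user<|message|>Hello!<|end|><|start|>assistant
        std::cout << ss.str() << "\n";
        return 0;
    }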
examples/talk-llama/llama-chat.h CHANGED
@@ -46,6 +46,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_DOTS1,
     LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
+    LLM_CHAT_TEMPLATE_OPENAI_MOE,
+    LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
     LLM_CHAT_TEMPLATE_KIMI_K2,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
examples/talk-llama/llama-context.cpp CHANGED
@@ -105,7 +105,7 @@ llama_context::llama_context(
 
     {
         const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
+        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : supports_set_rows;
 
         if (!supports_set_rows && !cparams.kv_unified) {
             LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
@@ -113,6 +113,15 @@ llama_context::llama_context(
         }
     }
 
+    {
+        const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
+        graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
+
+        if (graph_reuse_disable) {
+            LLAMA_LOG_WARN("%s: graph reuse disabled\n", __func__);
+        }
+    }
+
     const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
 
     LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
@@ -716,7 +725,7 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
     // in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters
     const auto gparams = graph_params(res, ubatch, mctx, gtype);
 
-    if (res->can_reuse(gparams)) {
+    if (!graph_reuse_disable && res->can_reuse(gparams)) {
         //LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
 
         n_reused++;
@@ -777,7 +786,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
     const auto & hparams = model.hparams;
 
     const int64_t n_embd = hparams.n_embd;
-    const int32_t n_vocab = model.vocab.n_tokens();
+    const int64_t n_vocab = model.vocab.n_tokens();
 
     // note: during encode, we always pass the full sequence starting from pos = 0
     if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
@@ -950,7 +959,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     const auto & vocab   = model.vocab;
     const auto & hparams = model.hparams;
 
-    const int32_t n_vocab = vocab.n_tokens();
+    const int64_t n_vocab = vocab.n_tokens();
     const int64_t n_embd  = hparams.n_embd;
 
     // when computing embeddings, all tokens are output
@@ -1319,21 +1328,21 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 }
 
 void llama_context::output_reorder() {
-    const uint32_t n_vocab = model.vocab.n_tokens();
+    const uint64_t n_vocab = model.vocab.n_tokens();
     const uint64_t n_embd  = model.hparams.n_embd;
 
-    for (uint32_t s = 0; s < output_swaps.size(); ++s) {
-        const uint32_t i0 = output_swaps[s].i0;
-        const uint32_t i1 = output_swaps[s].i1;
+    for (size_t s = 0; s < output_swaps.size(); ++s) {
+        const uint64_t i0 = output_swaps[s].i0;
+        const uint64_t i1 = output_swaps[s].i1;
 
         if (logits_size > 0) {
-            for (uint32_t k = 0; k < n_vocab; k++) {
+            for (uint64_t k = 0; k < n_vocab; k++) {
                 std::swap(logits[i0*n_vocab + k], logits[i1*n_vocab + k]);
             }
         }
 
         if (embd_size > 0) {
-            for (uint32_t k = 0; k < n_embd; k++) {
+            for (uint64_t k = 0; k < n_embd; k++) {
                 std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
             }
         }
@@ -1648,30 +1657,30 @@ size_t llama_context::state_set_data(const uint8_t * src, size_t size) {
     }
 }
 
-size_t llama_context::state_seq_get_size(llama_seq_id seq_id) {
+size_t llama_context::state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags) {
    llama_io_write_dummy io;
    try {
-        return state_seq_write_data(io, seq_id);
+        return state_seq_write_data(io, seq_id, flags);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
        return 0;
    }
 }
 
-size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) {
+size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size, llama_state_seq_flags flags) {
    llama_io_write_buffer io(dst, size);
    try {
-        return state_seq_write_data(io, seq_id);
+        return state_seq_write_data(io, seq_id, flags);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
        return 0;
    }
 }
 
-size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) {
+size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags) {
    llama_io_read_buffer io(src, size);
    try {
-        return state_seq_read_data(io, seq_id);
+        return state_seq_read_data(io, seq_id, flags);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
        return 0;
@@ -1769,7 +1778,7 @@ size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * file
     {
         const size_t state_size = file.size() - file.tell();
         llama_io_read_file io(&file);
-        const size_t nread = state_seq_read_data(io, seq_id);
+        const size_t nread = state_seq_read_data(io, seq_id, 0);
         if (!nread) {
             LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
             return 0;
@@ -1793,7 +1802,7 @@ size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * file
 
     // save the context state using stream saving
     llama_io_write_file io(&file);
-    state_seq_write_data(io, seq_id);
+    state_seq_write_data(io, seq_id, 0);
 
     const size_t res = file.tell();
     GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + io.n_bytes());
@@ -1962,21 +1971,21 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
     return io.n_bytes();
 }
 
-size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) {
+size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
     GGML_UNUSED(seq_id);
 
     if (memory) {
-        memory->state_write(io, seq_id);
+        memory->state_write(io, seq_id, flags);
     }
 
     return io.n_bytes();
 }
 
-size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) {
+size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
     GGML_UNUSED(seq_id);
 
     if (memory) {
-        memory->state_read(io, seq_id);
+        memory->state_read(io, seq_id, flags);
     }
 
     return io.n_bytes();
@@ -2039,7 +2048,7 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params
     opt_params.opt_period      = n_batch / n_ubatch;
     opt_params.get_opt_pars    = lopt_params.get_opt_pars;
     opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud;
-
+    opt_params.optimizer       = lopt_params.optimizer_type;
     opt_ctx = ggml_opt_init(opt_params);
 
     llama_opt_param_filter param_filter = lopt_params.param_filter;
@@ -2792,19 +2801,31 @@ bool llama_state_save_file(llama_context * ctx, const char * path_session, const
 }
 
 size_t llama_state_seq_get_size(llama_context * ctx, llama_seq_id seq_id) {
-    return ctx->state_seq_get_size(seq_id);
+    return llama_state_seq_get_size_ext(ctx, seq_id, 0);
 }
 
 size_t llama_state_seq_get_data(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) {
+    return llama_state_seq_get_data_ext(ctx, dst, size, seq_id, 0);
+}
+
+size_t llama_state_seq_set_data(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) {
+    return llama_state_seq_set_data_ext(ctx, src, size, seq_id, 0);
+}
+
+size_t llama_state_seq_get_size_ext(llama_context * ctx, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    return ctx->state_seq_get_size(seq_id, flags);
+}
+
+size_t llama_state_seq_get_data_ext(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
     ctx->synchronize();
 
-    return ctx->state_seq_get_data(seq_id, dst, size);
+    return ctx->state_seq_get_data(seq_id, dst, size, flags);
 }
 
-size_t llama_state_seq_set_data(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) {
+size_t llama_state_seq_set_data_ext(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
     ctx->synchronize();
 
-    return ctx->state_seq_set_data(seq_id, src, size);
+    return ctx->state_seq_set_data(seq_id, src, size, flags);
 }
 
 size_t llama_state_seq_save_file(llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
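
Note: a hedged usage sketch of the new *_ext sequence-state entry points introduced here, assuming the matching declarations (llama_state_seq_flags and the *_ext prototypes) are exposed via llama.h; passing flags = 0 mirrors what the non-_ext wrappers above now do internally.

    #include <vector>
    #include "llama.h" // assumption: declares llama_state_seq_flags and the *_ext functions

    // Snapshot the state of one sequence into a byte buffer.
    static std::vector<uint8_t> snapshot_seq(llama_context * ctx, llama_seq_id seq_id) {
        const llama_state_seq_flags flags = 0; // same behavior as the non-_ext API
        std::vector<uint8_t> buf(llama_state_seq_get_size_ext(ctx, seq_id, flags));
        llama_state_seq_get_data_ext(ctx, buf.data(), buf.size(), seq_id, flags);
        return buf;
    }

    // Restore a previously captured snapshot into (possibly another) sequence.
    static void restore_seq(llama_context * ctx, llama_seq_id seq_id, const std::vector<uint8_t> & buf) {
        llama_state_seq_set_data_ext(ctx, buf.data(), buf.size(), seq_id, /*flags=*/0);
    }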
examples/talk-llama/llama-context.h CHANGED
@@ -111,9 +111,9 @@ struct llama_context {
     size_t state_get_data(      uint8_t * dst, size_t size);
     size_t state_set_data(const uint8_t * src, size_t size);
 
-    size_t state_seq_get_size(llama_seq_id seq_id);
-    size_t state_seq_get_data(llama_seq_id seq_id,       uint8_t * dst, size_t size);
-    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size);
+    size_t state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags);
+    size_t state_seq_get_data(llama_seq_id seq_id,       uint8_t * dst, size_t size, llama_state_seq_flags flags);
+    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags);
 
     bool state_load_file(
             const char * filepath,
@@ -152,6 +152,7 @@ struct llama_context {
 
     void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
 
+    // TODO: more flexible combinations of logical/physical batch size and context size
     void opt_epoch(
             ggml_opt_dataset_t dataset,
             ggml_opt_result_t result_train,
@@ -212,8 +213,8 @@ private:
     size_t state_write_data(llama_io_write_i & io);
     size_t state_read_data (llama_io_read_i  & io);
 
-    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id);
-    size_t state_seq_read_data (llama_io_read_i  & io, llama_seq_id seq_id);
+    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);
+    size_t state_seq_read_data (llama_io_read_i  & io, llama_seq_id seq_id, llama_state_seq_flags flags);
 
     //
     // members
@@ -289,7 +290,10 @@ private:
 
     // env: LLAMA_SET_ROWS (temporary)
     // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows = false;
+    bool supports_set_rows = true;
+
+    // env: LLAMA_GRAPH_REUSE_DISABLE
+    bool graph_reuse_disable = false;
 
     // perf
     mutable int64_t t_start_us = 0;
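
Note: both toggles are read with getenv() in the llama_context constructor (see llama-context.cpp above), so they have to be in the environment before the context is created. A small POSIX-only sketch of setting them programmatically (setenv is not available on Windows; normally you would just export the variables in the shell):

    #include <cstdlib>

    int main() {
        // Disable graph reuse and opt out of ggml_set_rows() before creating the context.
        setenv("LLAMA_GRAPH_REUSE_DISABLE", "1", /*overwrite=*/1);
        setenv("LLAMA_SET_ROWS", "0", /*overwrite=*/1);
        // ... create the llama_model / llama_context afterwards ...
        return 0;
    }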
examples/talk-llama/llama-graph.cpp CHANGED
@@ -188,38 +188,23 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
188
 
189
  void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
190
  const int64_t n_tokens = ubatch->n_tokens;
191
- const int64_t n_seq_tokens = ubatch->n_seq_tokens;
192
  const int64_t n_seqs_unq = ubatch->n_seqs_unq;
193
 
194
  if (cparams.embeddings && (
195
- cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
196
- cparams.pooling_type == LLAMA_POOLING_TYPE_RANK
197
- )) {
 
198
  GGML_ASSERT(cls);
199
  GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
200
 
201
  uint32_t * data = (uint32_t *) cls->data;
202
  memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls));
203
 
204
- for (int i = 0; i < n_tokens; i += n_seq_tokens) {
205
- for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
206
- const llama_seq_id seq_id = ubatch->seq_id[i][s];
207
- const int32_t seq_idx = ubatch->seq_idx[seq_id];
208
-
209
- data[seq_idx] = i;
210
- }
211
- }
212
- }
213
-
214
- if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
215
- GGML_ASSERT(cls);
216
- GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
217
-
218
- uint32_t * data = (uint32_t *) cls->data;
219
- memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls));
220
 
221
- std::vector<int> last_pos(n_seqs_unq, -1);
222
- std::vector<int> last_row(n_seqs_unq, -1);
223
 
224
  for (int i = 0; i < n_tokens; ++i) {
225
  const llama_pos pos = ubatch->pos[i];
@@ -228,16 +213,20 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
228
  const llama_seq_id seq_id = ubatch->seq_id[i][s];
229
  const int32_t seq_idx = ubatch->seq_idx[seq_id];
230
 
231
- if (pos >= last_pos[seq_idx]) {
232
- last_pos[seq_idx] = pos;
233
- last_row[seq_idx] = i;
 
 
 
 
234
  }
235
  }
236
  }
237
 
238
  for (int s = 0; s < n_seqs_unq; ++s) {
239
- if (last_row[s] >= 0) {
240
- data[s] = last_row[s];
241
  }
242
  }
243
  }
@@ -751,6 +740,8 @@ ggml_tensor * llm_graph_context::build_ffn(
751
  cur = ggml_reglu(ctx0, cur);
752
  cb(cur, "ffn_reglu", il);
753
  } break;
 
 
754
  }
755
 
756
  if (gate && type_gate == LLM_FFN_PAR) {
@@ -760,8 +751,8 @@ ggml_tensor * llm_graph_context::build_ffn(
760
 
761
  if (down) {
762
  cur = build_lora_mm(down, cur);
763
- if (arch == LLM_ARCH_GLM4) {
764
- // GLM4 seems to have numerical issues with half-precision accumulators
765
  ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
766
  }
767
  }
@@ -796,13 +787,64 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
796
  bool scale_w,
797
  float w_scale,
798
  llama_expert_gating_func_type gating_op,
799
- int il) const {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
800
  const int64_t n_embd = cur->ne[0];
801
  const int64_t n_tokens = cur->ne[1];
802
  const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
803
 
804
- ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens]
805
- cb(logits, "ffn_moe_logits", il);
 
 
 
 
 
 
 
 
 
 
 
806
 
807
  ggml_tensor * probs = nullptr;
808
  switch (gating_op) {
@@ -814,6 +856,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
814
  {
815
  probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
816
  } break;
 
 
 
 
817
  default:
818
  GGML_ABORT("fatal error");
819
  }
@@ -842,6 +888,13 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
842
  ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
843
  cb(weights, "ffn_moe_weights", il);
844
 
 
 
 
 
 
 
 
845
  if (norm_w) {
846
  weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
847
 
@@ -870,6 +923,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
870
  ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
871
  cb(up, "ffn_moe_up", il);
872
 
 
 
 
 
 
873
  ggml_tensor * experts = nullptr;
874
  if (gate_exps) {
875
  cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
@@ -878,6 +936,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
878
  cur = up;
879
  }
880
 
 
 
 
 
 
881
  switch (type_op) {
882
  case LLM_FFN_SILU:
883
  if (gate_exps) {
@@ -895,6 +958,22 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
895
  cur = ggml_gelu(ctx0, cur);
896
  cb(cur, "ffn_moe_gelu", il);
897
  } break;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
898
  default:
899
  GGML_ABORT("fatal error");
900
  }
@@ -902,6 +981,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
902
  experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
903
  cb(experts, "ffn_moe_down", il);
904
 
 
 
 
 
 
905
  if (!weight_before_ffn) {
906
  experts = ggml_mul(ctx0, experts, weights);
907
  cb(cur, "ffn_moe_weighted", il);
@@ -1140,6 +1224,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
1140
  ggml_tensor * kq_b,
1141
  ggml_tensor * kq_mask,
1142
  ggml_tensor * v_mla,
 
1143
  float kq_scale) const {
1144
  const bool v_trans = v->nb[1] > v->nb[2];
1145
 
@@ -1176,7 +1261,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
1176
  cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
1177
  hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
1178
 
1179
- ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
 
1180
 
1181
  if (v_mla) {
1182
  #if 0
@@ -1224,6 +1310,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
1224
  }
1225
 
1226
  kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
 
1227
 
1228
  if (!v_trans) {
1229
  // note: avoid this branch
@@ -1294,7 +1381,7 @@ ggml_tensor * llm_graph_context::build_attn(
1294
  ggml_tensor * k = k_cur;
1295
  ggml_tensor * v = v_cur;
1296
 
1297
- ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
1298
  cb(cur, "kqv_out", il);
1299
 
1300
  if (wo) {
@@ -1382,13 +1469,13 @@ ggml_tensor * llm_graph_context::build_attn(
1382
  ggml_tensor * k = mctx_cur->get_k(ctx0, il);
1383
  ggml_tensor * v = mctx_cur->get_v(ctx0, il);
1384
 
1385
- ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
1386
  cb(cur, "kqv_out", il);
1387
 
1388
  if (wo) {
1389
  cur = build_lora_mm(wo, cur);
1390
- if (arch == LLM_ARCH_GLM4) {
1391
- // GLM4 seems to have numerical issues with half-precision accumulators
1392
  ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
1393
  }
1394
  }
@@ -1411,6 +1498,32 @@ ggml_tensor * llm_graph_context::build_attn(
1411
  ggml_tensor * v_mla,
1412
  float kq_scale,
1413
  int il) const {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1414
  // these nodes are added to the graph together so that they are not reordered
1415
  // by doing so, the number of splits in the graph is reduced
1416
  ggml_build_forward_expand(gf, q_cur);
@@ -1448,7 +1561,7 @@ ggml_tensor * llm_graph_context::build_attn(
1448
  ggml_tensor * k = mctx_cur->get_k(ctx0, il);
1449
  ggml_tensor * v = mctx_cur->get_v(ctx0, il);
1450
 
1451
- ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
1452
  cb(cur, "kqv_out", il);
1453
 
1454
  if (wo) {
@@ -1502,7 +1615,7 @@ ggml_tensor * llm_graph_context::build_attn(
1502
  ggml_tensor * k = k_cur;
1503
  ggml_tensor * v = v_cur;
1504
 
1505
- ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
1506
  cb(cur, "kqv_out", il);
1507
 
1508
  if (wo) {
@@ -1561,16 +1674,17 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
1561
 
1562
  ggml_tensor * llm_graph_context::build_rs(
1563
  ggml_tensor * s,
1564
- ggml_tensor * state_copy,
 
1565
  int32_t state_size,
1566
  int32_t n_seqs,
1567
- uint32_t n_kv,
1568
- uint32_t kv_head,
1569
- uint32_t kv_size,
1570
  int32_t rs_zero,
1571
  const llm_graph_get_rows_fn & get_state_rows) const {
1572
 
1573
- ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, kv_size);
1574
 
1575
  // Clear a single state which will then be copied to the other cleared states.
1576
  // Note that this is a no-op when the view is zero-sized.
@@ -1578,39 +1692,44 @@ ggml_tensor * llm_graph_context::build_rs(
1578
  ggml_build_forward_expand(gf, ggml_scale_inplace(ctx0, state_zero, 0));
1579
 
1580
  // copy states
1581
- // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
1582
- // {state_size, kv_size} -> {state_size, n_seqs}
1583
- ggml_tensor * output_states = get_state_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_seqs, 0));
1584
  ggml_build_forward_expand(gf, output_states);
1585
 
1586
- // copy extra states which won't be changed further (between n_seqs and n_kv)
1587
- ggml_tensor * states_extra = ggml_get_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_kv - n_seqs, n_seqs*state_copy->nb[0]));
1588
  ggml_build_forward_expand(gf,
1589
  ggml_cpy(ctx0,
1590
  states_extra,
1591
- ggml_view_1d(ctx0, s, state_size*(n_kv - n_seqs), (kv_head + n_seqs)*state_size*ggml_element_size(s))));
1592
 
1593
  return output_states;
1594
  }
1595
 
1596
  static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
1597
  ggml_context * ctx0,
 
1598
  const llama_memory_recurrent_context * mctx_cur) {
1599
 
1600
  auto inp = std::make_unique<llm_graph_input_rs>(mctx_cur);
1601
 
1602
- const auto n_rs = mctx_cur->get_n_rs();
 
1603
 
1604
  inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_rs);
1605
  ggml_set_input(inp->s_copy);
1606
 
 
 
 
1607
  return inp;
1608
  }
1609
 
1610
  llm_graph_input_rs * llm_graph_context::build_rs_inp() const {
1611
  const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
1612
 
1613
- auto inp = build_rs_inp_impl(ctx0, mctx_cur);
1614
 
1615
  return (llm_graph_input_rs *) res->add_input(std::move(inp));
1616
  }
@@ -1623,7 +1742,9 @@ ggml_tensor * llm_graph_context::build_rs(
1623
  const llm_graph_get_rows_fn & get_state_rows) const {
1624
  const auto * kv_state = inp->mctx;
1625
 
1626
- return build_rs(s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), get_state_rows);
 
 
1627
  }
1628
 
1629
  ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
@@ -1670,7 +1791,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
1670
  llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
1671
  const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
1672
 
1673
- auto inp_rs = build_rs_inp_impl(ctx0, mctx_cur->get_recr());
1674
  auto inp_attn = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
1675
 
1676
  auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
 
188
 
189
  void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
190
  const int64_t n_tokens = ubatch->n_tokens;
 
191
  const int64_t n_seqs_unq = ubatch->n_seqs_unq;
192
 
193
  if (cparams.embeddings && (
194
+ cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
195
+ cparams.pooling_type == LLAMA_POOLING_TYPE_RANK ||
196
+ cparams.pooling_type == LLAMA_POOLING_TYPE_LAST
197
+ )) {
198
  GGML_ASSERT(cls);
199
  GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
200
 
201
  uint32_t * data = (uint32_t *) cls->data;
202
  memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls));
203
 
204
+ std::vector<int> target_pos(n_seqs_unq, -1);
205
+ std::vector<int> target_row(n_seqs_unq, -1);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
+ bool last = cparams.pooling_type == LLAMA_POOLING_TYPE_LAST;
 
208
 
209
  for (int i = 0; i < n_tokens; ++i) {
210
  const llama_pos pos = ubatch->pos[i];
 
213
  const llama_seq_id seq_id = ubatch->seq_id[i][s];
214
  const int32_t seq_idx = ubatch->seq_idx[seq_id];
215
 
216
+ if (
217
+ (target_pos[seq_idx] == -1) ||
218
+ ( last && pos >= target_pos[seq_idx]) ||
219
+ (!last && pos < target_pos[seq_idx])
220
+ ) {
221
+ target_pos[seq_idx] = pos;
222
+ target_row[seq_idx] = i;
223
  }
224
  }
225
  }
226
 
227
  for (int s = 0; s < n_seqs_unq; ++s) {
228
+ if (target_row[s] >= 0) {
229
+ data[s] = target_row[s];
230
  }
231
  }
232
  }
 
740
  cur = ggml_reglu(ctx0, cur);
741
  cb(cur, "ffn_reglu", il);
742
  } break;
743
+ default:
744
+ GGML_ABORT("fatal error");
745
  }
746
 
747
  if (gate && type_gate == LLM_FFN_PAR) {
 
751
 
752
  if (down) {
753
  cur = build_lora_mm(down, cur);
754
+ if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
755
+ // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
756
  ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
757
  }
758
  }
 
787
  bool scale_w,
788
  float w_scale,
789
  llama_expert_gating_func_type gating_op,
790
+ int il,
791
+ ggml_tensor * probs_in) const {
792
+ return build_moe_ffn(
793
+ cur,
794
+ gate_inp, /* gate_inp_b */ nullptr,
795
+ up_exps, /* up_exps_b */ nullptr,
796
+ gate_exps, /* gate_exps_b */ nullptr,
797
+ down_exps, /* down_exps_b */ nullptr,
798
+ exp_probs_b,
799
+ n_expert,
800
+ n_expert_used,
801
+ type_op,
802
+ norm_w,
803
+ scale_w,
804
+ w_scale,
805
+ gating_op,
806
+ il,
807
+ probs_in
808
+ );
809
+ }
810
+
811
+ ggml_tensor * llm_graph_context::build_moe_ffn(
812
+ ggml_tensor * cur,
813
+ ggml_tensor * gate_inp,
814
+ ggml_tensor * gate_inp_b,
815
+ ggml_tensor * up_exps,
816
+ ggml_tensor * up_exps_b,
817
+ ggml_tensor * gate_exps,
818
+ ggml_tensor * gate_exps_b,
819
+ ggml_tensor * down_exps,
820
+ ggml_tensor * down_exps_b,
821
+ ggml_tensor * exp_probs_b,
822
+ int64_t n_expert,
823
+ int64_t n_expert_used,
824
+ llm_ffn_op_type type_op,
825
+ bool norm_w,
826
+ bool scale_w,
827
+ float w_scale,
828
+ llama_expert_gating_func_type gating_op,
829
+ int il,
830
+ ggml_tensor * probs_in) const {
831
  const int64_t n_embd = cur->ne[0];
832
  const int64_t n_tokens = cur->ne[1];
833
  const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
834
 
835
+ ggml_tensor * logits = nullptr;
836
+
837
+ if (probs_in == nullptr) {
838
+ logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens]
839
+ cb(logits, "ffn_moe_logits", il);
840
+ } else {
841
+ logits = probs_in;
842
+ }
843
+
844
+ if (gate_inp_b) {
845
+ logits = ggml_add(ctx0, logits, gate_inp_b);
846
+ cb(logits, "ffn_moe_logits_biased", il);
847
+ }
848
 
849
  ggml_tensor * probs = nullptr;
850
  switch (gating_op) {
 
856
  {
857
  probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
858
  } break;
859
+ case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT:
860
+ {
861
+ probs = logits; // [n_expert, n_tokens]
862
+ } break;
863
  default:
864
  GGML_ABORT("fatal error");
865
  }
 
888
  ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
889
  cb(weights, "ffn_moe_weights", il);
890
 
891
+ if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) {
892
+ weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
893
+ weights = ggml_soft_max(ctx0, weights); // [n_expert_used, n_tokens]
894
+ weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
895
+ cb(weights, "ffn_moe_weights_softmax", il);
896
+ }
897
+
898
  if (norm_w) {
899
  weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
900
 
 
923
  ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
924
  cb(up, "ffn_moe_up", il);
925
 
926
+ if (up_exps_b) {
927
+ up = ggml_add_id(ctx0, up, up_exps_b, selected_experts);
928
+ cb(up, "ffn_moe_up_biased", il);
929
+ }
930
+
931
  ggml_tensor * experts = nullptr;
932
  if (gate_exps) {
933
  cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
 
936
  cur = up;
937
  }
938
 
939
+ if (gate_exps_b) {
940
+ cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts);
941
+ cb(cur, "ffn_moe_gate_biased", il);
942
+ }
943
+
944
  switch (type_op) {
945
  case LLM_FFN_SILU:
946
  if (gate_exps) {
 
958
  cur = ggml_gelu(ctx0, cur);
959
  cb(cur, "ffn_moe_gelu", il);
960
  } break;
961
+ case LLM_FFN_SWIGLU_OAI_MOE:
962
+ {
963
+ // TODO: move to hparams?
964
+ constexpr float alpha = 1.702f;
965
+ constexpr float limit = 7.0f;
966
+ cur = ggml_swiglu_oai(ctx0, cur, up, alpha, limit);
967
+ cb(cur, "ffn_moe_swiglu_oai", il);
968
+ } break;
969
+ case LLM_FFN_RELU:
970
+ if (gate_exps) {
971
+ cur = ggml_reglu_split(ctx0, cur, up);
972
+ cb(cur, "ffn_moe_reglu", il);
973
+ } else {
974
+ cur = ggml_relu(ctx0, cur);
975
+ cb(cur, "ffn_moe_relu", il);
976
+ } break;
977
  default:
978
  GGML_ABORT("fatal error");
979
  }
 
981
  experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
982
  cb(experts, "ffn_moe_down", il);
983
 
984
+ if (down_exps_b) {
985
+ experts = ggml_add_id(ctx0, experts, down_exps_b, selected_experts);
986
+ cb(experts, "ffn_moe_down_biased", il);
987
+ }
988
+
989
  if (!weight_before_ffn) {
990
  experts = ggml_mul(ctx0, experts, weights);
991
  cb(cur, "ffn_moe_weighted", il);
 
1224
  ggml_tensor * kq_b,
1225
  ggml_tensor * kq_mask,
1226
  ggml_tensor * v_mla,
1227
+ ggml_tensor * sinks,
1228
  float kq_scale) const {
1229
  const bool v_trans = v->nb[1] > v->nb[2];
1230
 
 
1261
  cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
1262
  hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
1263
 
1264
+ ggml_flash_attn_ext_add_sinks(cur, sinks);
1265
+ ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
1266
 
1267
  if (v_mla) {
1268
  #if 0
 
1310
  }
1311
 
1312
  kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
1313
+ ggml_soft_max_add_sinks(kq, sinks);
1314
 
1315
  if (!v_trans) {
1316
  // note: avoid this branch
 
1381
  ggml_tensor * k = k_cur;
1382
  ggml_tensor * v = v_cur;
1383
 
1384
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
1385
  cb(cur, "kqv_out", il);
1386
 
1387
  if (wo) {
 
1469
  ggml_tensor * k = mctx_cur->get_k(ctx0, il);
1470
  ggml_tensor * v = mctx_cur->get_v(ctx0, il);
1471
 
1472
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
1473
  cb(cur, "kqv_out", il);
1474
 
1475
  if (wo) {
1476
  cur = build_lora_mm(wo, cur);
1477
+ if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
1478
+ // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
1479
  ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
1480
  }
1481
  }
 
1498
  ggml_tensor * v_mla,
1499
  float kq_scale,
1500
  int il) const {
1501
+ return build_attn_with_sinks(
1502
+ inp,
1503
+ wo,
1504
+ wo_b,
1505
+ q_cur,
1506
+ k_cur,
1507
+ v_cur,
1508
+ kq_b,
1509
+ v_mla,
1510
+ nullptr,
1511
+ kq_scale,
1512
+ il);
1513
+ }
1514
+
1515
+ ggml_tensor * llm_graph_context::build_attn_with_sinks(
1516
+ llm_graph_input_attn_kv_unified_iswa * inp,
1517
+ ggml_tensor * wo,
1518
+ ggml_tensor * wo_b,
1519
+ ggml_tensor * q_cur,
1520
+ ggml_tensor * k_cur,
1521
+ ggml_tensor * v_cur,
1522
+ ggml_tensor * kq_b,
1523
+ ggml_tensor * v_mla,
1524
+ ggml_tensor * sinks,
1525
+ float kq_scale,
1526
+ int il) const {
1527
  // these nodes are added to the graph together so that they are not reordered
1528
  // by doing so, the number of splits in the graph is reduced
1529
  ggml_build_forward_expand(gf, q_cur);
 
1561
  ggml_tensor * k = mctx_cur->get_k(ctx0, il);
1562
  ggml_tensor * v = mctx_cur->get_v(ctx0, il);
1563
 
1564
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, sinks, kq_scale);
1565
  cb(cur, "kqv_out", il);
1566
 
1567
  if (wo) {
 
1615
  ggml_tensor * k = k_cur;
1616
  ggml_tensor * v = v_cur;
1617
 
1618
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
1619
  cb(cur, "kqv_out", il);
1620
 
1621
  if (wo) {
 
1674
 
1675
  ggml_tensor * llm_graph_context::build_rs(
1676
  ggml_tensor * s,
1677
+ ggml_tensor * state_copy_main,
1678
+ ggml_tensor * state_copy_extra,
1679
  int32_t state_size,
1680
  int32_t n_seqs,
1681
+ uint32_t n_rs,
1682
+ uint32_t rs_head,
1683
+ uint32_t rs_size,
1684
  int32_t rs_zero,
1685
  const llm_graph_get_rows_fn & get_state_rows) const {
1686
 
1687
+ ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, rs_size);
1688
 
1689
  // Clear a single state which will then be copied to the other cleared states.
1690
  // Note that this is a no-op when the view is zero-sized.
 
1692
  ggml_build_forward_expand(gf, ggml_scale_inplace(ctx0, state_zero, 0));
1693
 
1694
  // copy states
1695
+ // NOTE: assuming the copy destinations are ALL contained between rs_head and rs_head + n_rs
1696
+ // {state_size, rs_size} -> {state_size, n_seqs}
1697
+ ggml_tensor * output_states = get_state_rows(ctx0, states, state_copy_main);
1698
  ggml_build_forward_expand(gf, output_states);
1699
 
1700
+ // copy extra states which won't be changed further (between n_seqs and n_rs)
1701
+ ggml_tensor * states_extra = ggml_get_rows(ctx0, states, state_copy_extra);
1702
  ggml_build_forward_expand(gf,
1703
  ggml_cpy(ctx0,
1704
  states_extra,
1705
+ ggml_view_1d(ctx0, s, state_size*(n_rs - n_seqs), (rs_head + n_seqs)*state_size*ggml_element_size(s))));
1706
 
1707
  return output_states;
1708
  }
1709
 
1710
  static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
1711
  ggml_context * ctx0,
1712
+ const llama_ubatch & ubatch,
1713
  const llama_memory_recurrent_context * mctx_cur) {
1714
 
1715
  auto inp = std::make_unique<llm_graph_input_rs>(mctx_cur);
1716
 
1717
+ const int64_t n_rs = mctx_cur->get_n_rs();
1718
+ const int64_t n_seqs = ubatch.n_seqs;
1719
 
1720
  inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_rs);
1721
  ggml_set_input(inp->s_copy);
1722
 
1723
+ inp->s_copy_main = ggml_view_1d(ctx0, inp->s_copy, n_seqs, 0);
1724
+ inp->s_copy_extra = ggml_view_1d(ctx0, inp->s_copy, n_rs - n_seqs, n_seqs * inp->s_copy->nb[0]);
1725
+
1726
  return inp;
1727
  }
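A small plain-C++ sketch of the s_copy split set up above, under the assumption spelled out in the NOTE: the first n_seqs entries ("main") gather the states that flow through the graph, and the remaining n_rs - n_seqs entries ("extra") are only shuffled back into the cache starting at rs_head + n_seqs. All values are made up.

// s_copy split sketch: main = first n_seqs rows, extra = the remaining n_rs - n_seqs rows
#include <cstdio>
#include <vector>

int main() {
    const int n_rs = 5, n_seqs = 2, rs_head = 0, state_size = 1;

    std::vector<float> states = {10.f, 11.f, 12.f, 13.f, 14.f}; // one scalar state per cell
    const std::vector<int> s_copy = {3, 1, 0, 2, 4};            // source cell for each destination

    // main: gathered rows that continue through the graph
    std::vector<float> output_states;
    for (int i = 0; i < n_seqs; ++i) {
        output_states.push_back(states[s_copy[i] * state_size]);
    }

    // extra: gathered once, then written back contiguously at rs_head + n_seqs
    std::vector<float> states_extra;
    for (int i = n_seqs; i < n_rs; ++i) {
        states_extra.push_back(states[s_copy[i] * state_size]);
    }
    for (size_t i = 0; i < states_extra.size(); ++i) {
        states[(rs_head + n_seqs + i) * state_size] = states_extra[i];
    }

    printf("main : %.0f %.0f\n", output_states[0], output_states[1]); // 13 11
    printf("cache: ");
    for (float v : states) {
        printf("%.0f ", v); // 10 11 10 12 14
    }
    printf("\n");
    return 0;
}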
1728
 
1729
  llm_graph_input_rs * llm_graph_context::build_rs_inp() const {
1730
  const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
1731
 
1732
+ auto inp = build_rs_inp_impl(ctx0, ubatch, mctx_cur);
1733
 
1734
  return (llm_graph_input_rs *) res->add_input(std::move(inp));
1735
  }
 
1742
  const llm_graph_get_rows_fn & get_state_rows) const {
1743
  const auto * kv_state = inp->mctx;
1744
 
1745
+ return build_rs(s, inp->s_copy_main, inp->s_copy_extra, state_size, n_seqs,
1746
+ kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(),
1747
+ get_state_rows);
1748
  }
1749
 
1750
  ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
 
1791
  llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
1792
  const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
1793
 
1794
+ auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
1795
  auto inp_attn = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
1796
 
1797
  auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
examples/talk-llama/llama-graph.h CHANGED
@@ -39,6 +39,7 @@ enum llm_ffn_op_type {
39
  LLM_FFN_SWIGLU,
40
  LLM_FFN_GEGLU,
41
  LLM_FFN_REGLU,
 
42
  };
43
 
44
  enum llm_ffn_gate_type {
@@ -144,7 +145,7 @@ public:
144
 
145
  ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]
146
 
147
- const llama_hparams & hparams;
148
  };
149
 
150
  class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
@@ -158,7 +159,7 @@ public:
158
 
159
  ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]
160
 
161
- const llama_hparams & hparams;
162
 
163
  const llama_kv_cache_unified_context * mctx;
164
  };
@@ -177,8 +178,8 @@ public:
177
 
178
  ggml_tensor * out_ids; // I32 [n_outputs]
179
 
180
- const llama_hparams & hparams;
181
- const llama_cparams & cparams;
182
 
183
  const uint32_t n_outputs;
184
  };
@@ -192,7 +193,7 @@ public:
192
 
193
  ggml_tensor * mean; // F32 [n_batch, n_batch]
194
 
195
- const llama_cparams & cparams;
196
  };
197
 
198
  class llm_graph_input_cls : public llm_graph_input_i {
@@ -204,7 +205,7 @@ public:
204
 
205
  ggml_tensor * cls; // I32 [n_batch]
206
 
207
- const llama_cparams & cparams;
208
  };
209
 
210
  class llm_graph_input_rs : public llm_graph_input_i {
@@ -214,7 +215,12 @@ public:
214
 
215
  void set_input(const llama_ubatch * ubatch) override;
216
 
217
- ggml_tensor * s_copy; // I32 [kv_size]
 
 
 
 
 
218
 
219
  const llama_memory_recurrent_context * mctx;
220
  };
@@ -247,8 +253,8 @@ public:
247
  ggml_tensor * kq_mask = nullptr; // F32 [n_tokens, n_batch, 1, 1]
248
  ggml_tensor * kq_mask_cnv = nullptr; // [n_tokens, n_batch, 1, 1]
249
 
250
- const llama_hparams & hparams;
251
- const llama_cparams & cparams;
252
  };
253
 
254
  class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
@@ -278,8 +284,11 @@ public:
278
  ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
279
  ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
280
 
281
- const llama_hparams & hparams;
282
- const llama_cparams & cparams;
 
 
 
283
 
284
  const llama_kv_cache_unified_context * mctx;
285
  };
@@ -318,8 +327,8 @@ public:
318
  ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
319
  ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
320
 
321
- const llama_hparams & hparams;
322
- const llama_cparams & cparams;
323
 
324
  const llama_kv_cache_unified_iswa_context * mctx;
325
  };
@@ -415,7 +424,9 @@ struct llm_graph_params {
415
  (!ubatch.embd && !other.ubatch.embd)
416
  );
417
 
418
- if (can_reuse_ubatch && !ubatch.equal_seqs()) {
 
 
419
  if (!ubatch.data) {
420
  // if the old ubatch does not own its data, then we cannot guarantee that it is still alive, and
421
  // therefore we cannot perform the sequence id check. normally should never happen
@@ -609,6 +620,7 @@ struct llm_graph_context {
609
  llm_ffn_gate_type type_gate,
610
  int il) const;
611
 
 
612
  ggml_tensor * build_moe_ffn(
613
  ggml_tensor * cur,
614
  ggml_tensor * gate_inp,
@@ -623,7 +635,29 @@ struct llm_graph_context {
623
  bool scale_w,
624
  float w_scale,
625
  llama_expert_gating_func_type gating_op,
626
- int il) const;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
627
 
628
  //
629
  // inputs
@@ -651,6 +685,7 @@ struct llm_graph_context {
651
  ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
652
  ggml_tensor * kq_b,
653
  ggml_tensor * kq_mask,
 
654
  ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
655
  float kq_scale) const;
656
 
@@ -697,6 +732,20 @@ struct llm_graph_context {
697
  float kq_scale,
698
  int il) const;
699
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
700
  llm_graph_input_attn_cross * build_attn_inp_cross() const;
701
 
702
  ggml_tensor * build_attn(
@@ -715,7 +764,6 @@ struct llm_graph_context {
715
  // recurrent
716
  //
717
 
718
- // TODO: avoid notion of "kv"
719
  // TODO: move this implementation to llama_memory_recurrent.
720
  // this is analogous to llama_kv_cache_unified::cpy_k / cpy_v
721
  // when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the
@@ -723,12 +771,13 @@ struct llm_graph_context {
723
  // `llama_memory_recurrent`
724
  ggml_tensor * build_rs(
725
  ggml_tensor * s,
726
- ggml_tensor * state_copy,
 
727
  int32_t state_size,
728
  int32_t n_seqs,
729
- uint32_t n_kv,
730
- uint32_t kv_head,
731
- uint32_t kv_size,
732
  int32_t rs_zero,
733
  const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
734
 
 
39
  LLM_FFN_SWIGLU,
40
  LLM_FFN_GEGLU,
41
  LLM_FFN_REGLU,
42
+ LLM_FFN_SWIGLU_OAI_MOE,
43
  };
44
 
45
  enum llm_ffn_gate_type {
 
145
 
146
  ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]
147
 
148
+ const llama_hparams hparams;
149
  };
150
 
151
  class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
 
159
 
160
  ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]
161
 
162
+ const llama_hparams hparams;
163
 
164
  const llama_kv_cache_unified_context * mctx;
165
  };
 
178
 
179
  ggml_tensor * out_ids; // I32 [n_outputs]
180
 
181
+ const llama_hparams hparams;
182
+ const llama_cparams cparams;
183
 
184
  const uint32_t n_outputs;
185
  };
 
193
 
194
  ggml_tensor * mean; // F32 [n_batch, n_batch]
195
 
196
+ const llama_cparams cparams;
197
  };
198
 
199
  class llm_graph_input_cls : public llm_graph_input_i {
 
205
 
206
  ggml_tensor * cls; // I32 [n_batch]
207
 
208
+ const llama_cparams cparams;
209
  };
210
 
211
  class llm_graph_input_rs : public llm_graph_input_i {
 
215
 
216
  void set_input(const llama_ubatch * ubatch) override;
217
 
218
+ ggml_tensor * s_copy; // I32 [n_rs]
219
+
220
+ // views of s_copy, computed once per graph
221
+ // and shared across layers which use build_rs
222
+ ggml_tensor * s_copy_main; // I32 [n_seqs]
223
+ ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
224
 
225
  const llama_memory_recurrent_context * mctx;
226
  };
 
253
  ggml_tensor * kq_mask = nullptr; // F32 [n_tokens, n_batch, 1, 1]
254
  ggml_tensor * kq_mask_cnv = nullptr; // [n_tokens, n_batch, 1, 1]
255
 
256
+ const llama_hparams hparams;
257
+ const llama_cparams cparams;
258
  };
259
 
260
  class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
 
284
  ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
285
  ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
286
 
287
+ // note: these have to be copies because in order to be able to reuse a graph, its inputs
288
+ // need to carry these parameters with them. otherwise, they can point to freed
289
+ // llm_graph_params from a previous batch, causing stack-use-after-return
290
+ const llama_hparams hparams;
291
+ const llama_cparams cparams;
292
 
293
  const llama_kv_cache_unified_context * mctx;
294
  };
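The note above describes a lifetime hazard that is easy to reproduce outside of llama.cpp; here is a minimal hypothetical sketch of why storing a copy is safer than keeping a reference to caller-owned parameters:

// sketch: the input object outlives the params it was built from;
// a stored reference would dangle after make_input() returns, a copy stays valid
#include <cstdio>

struct params_t {
    int n_ctx;
};

struct input_t {
    params_t params; // copy: owns its own data (a reference or pointer here would dangle)
};

static input_t make_input() {
    params_t p = {4096}; // lives only for the duration of this call
    return input_t{p};   // copying is what makes reusing input_t for a later batch safe
}

int main() {
    input_t inp = make_input();
    printf("n_ctx = %d\n", inp.params.n_ctx); // 4096, no stack-use-after-return
    return 0;
}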
 
327
  ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
328
  ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
329
 
330
+ const llama_hparams hparams;
331
+ const llama_cparams cparams;
332
 
333
  const llama_kv_cache_unified_iswa_context * mctx;
334
  };
 
424
  (!ubatch.embd && !other.ubatch.embd)
425
  );
426
 
427
+ // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
428
+ // this is because the set of attention streams would be different for different sequences
429
+ if (can_reuse_ubatch && ubatch.equal_seqs()) {
430
  if (!ubatch.data) {
431
  // if the old ubatch does not own its data, then we cannot guarantee that it is still alive, and
432
  // therefore we cannot perform the sequence id check. normally should never happen
 
620
  llm_ffn_gate_type type_gate,
621
  int il) const;
622
 
623
+ // build MoE FFN without bias tensors
624
  ggml_tensor * build_moe_ffn(
625
  ggml_tensor * cur,
626
  ggml_tensor * gate_inp,
 
635
  bool scale_w,
636
  float w_scale,
637
  llama_expert_gating_func_type gating_op,
638
+ int il,
639
+ ggml_tensor * probs_in = nullptr) const;
640
+
641
+ ggml_tensor * build_moe_ffn(
642
+ ggml_tensor * cur,
643
+ ggml_tensor * gate_inp,
644
+ ggml_tensor * gate_inp_b,
645
+ ggml_tensor * up_exps,
646
+ ggml_tensor * up_exps_b,
647
+ ggml_tensor * gate_exps,
648
+ ggml_tensor * gate_exps_b,
649
+ ggml_tensor * down_exps,
650
+ ggml_tensor * down_exps_b,
651
+ ggml_tensor * exp_probs_b,
652
+ int64_t n_expert,
653
+ int64_t n_expert_used,
654
+ llm_ffn_op_type type_op,
655
+ bool norm_w,
656
+ bool scale_w,
657
+ float w_scale,
658
+ llama_expert_gating_func_type gating_op,
659
+ int il,
660
+ ggml_tensor * probs_in = nullptr) const;
661
 
662
  //
663
  // inputs
 
685
  ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
686
  ggml_tensor * kq_b,
687
  ggml_tensor * kq_mask,
688
+ ggml_tensor * sinks,
689
  ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
690
  float kq_scale) const;
691
 
 
732
  float kq_scale,
733
  int il) const;
734
 
735
+ // TODO: temporary to keep the diff small. after the code is public, we will refactor to simplify this
736
+ ggml_tensor * build_attn_with_sinks(
737
+ llm_graph_input_attn_kv_unified_iswa * inp,
738
+ ggml_tensor * wo,
739
+ ggml_tensor * wo_b,
740
+ ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
741
+ ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
742
+ ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
743
+ ggml_tensor * kq_b,
744
+ ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
745
+ ggml_tensor * sinks, // [n_head_q]
746
+ float kq_scale,
747
+ int il) const;
748
+
749
  llm_graph_input_attn_cross * build_attn_inp_cross() const;
750
 
751
  ggml_tensor * build_attn(
 
764
  // recurrent
765
  //
766
 
 
767
  // TODO: move this implementation to llama_memory_recurrent.
768
  // this is analogous to llama_kv_cache_unified::cpy_k / cpy_v
769
  // when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the
 
771
  // `llama_memory_recurrent`
772
  ggml_tensor * build_rs(
773
  ggml_tensor * s,
774
+ ggml_tensor * state_copy_main,
775
+ ggml_tensor * state_copy_extra,
776
  int32_t state_size,
777
  int32_t n_seqs,
778
+ uint32_t n_rs,
779
+ uint32_t rs_head,
780
+ uint32_t rs_size,
781
  int32_t rs_zero,
782
  const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
783
 
examples/talk-llama/llama-hparams.cpp CHANGED
@@ -2,9 +2,15 @@
2
 
3
  #include "ggml.h"
4
 
5
- void llama_hparams::set_swa_pattern(uint32_t n_pattern) {
6
- for (uint32_t il = 0; il < n_layer; ++il) {
7
- swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
 
 
 
 
 
 
8
  }
9
  }
10
 
 
2
 
3
  #include "ggml.h"
4
 
5
+ void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
6
+ if (dense_first) {
7
+ for (uint32_t il = 0; il < n_layer; ++il) {
8
+ swa_layers[il] = n_pattern == 0 || (il % n_pattern != 0);
9
+ }
10
+ } else {
11
+ for (uint32_t il = 0; il < n_layer; ++il) {
12
+ swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
13
+ }
14
  }
15
  }
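A standalone sketch that reproduces the two patterns documented in llama-hparams.h, using the same predicates as the implementation above:

// prints the SWA/dense layout for (n_pattern = 3, dense_first = false) and (n_pattern = 2, dense_first = true)
#include <cstdint>
#include <cstdio>

static bool is_swa(uint32_t il, uint32_t n_pattern, bool dense_first) {
    if (dense_first) {
        return n_pattern == 0 || (il % n_pattern != 0);
    }
    return n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
}

int main() {
    for (uint32_t il = 0; il < 7; ++il) {
        printf("n_pattern=3 dense_first=false il=%u: %s\n", il, is_swa(il, 3, false) ? "swa" : "dense");
    }
    for (uint32_t il = 0; il < 4; ++il) {
        printf("n_pattern=2 dense_first=true  il=%u: %s\n", il, is_swa(il, 2, true) ? "swa" : "dense");
    }
    return 0;
}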
16
 
examples/talk-llama/llama-hparams.h CHANGED
@@ -9,9 +9,10 @@
9
  #define LLAMA_MAX_EXPERTS 384 // Kimi-K2
10
 
11
  enum llama_expert_gating_func_type {
12
- LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
13
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
14
- LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
 
15
  };
16
 
17
  enum llama_swa_type {
@@ -73,6 +74,7 @@ struct llama_hparams {
73
  bool expert_weights_norm = false;
74
  uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
75
  uint32_t moe_every_n_layers = 0;
 
76
 
77
  float f_norm_eps;
78
  float f_norm_rms_eps;
@@ -140,7 +142,7 @@ struct llama_hparams {
140
  // for Classifiers
141
  uint32_t n_cls_out = 1;
142
 
143
- // llama4
144
  uint32_t n_moe_layer_step = 0;
145
  uint32_t n_no_rope_layer_step = 4;
146
  uint32_t n_attn_temp_floor_scale = 8192;
@@ -161,9 +163,10 @@ struct llama_hparams {
161
  enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
162
 
163
  // this value n_pattern means that every nth layer is dense (i.e. non-SWA)
 
164
  // note that if n_pattern == 0, all layers are SWA
165
  // if n_pattern == 1, all layers are dense
166
- // example: n_pattern = 3
167
  // il == 0: swa
168
  // il == 1: swa
169
  // il == 2: dense
@@ -172,7 +175,13 @@ struct llama_hparams {
172
  // il == 5: dense
173
  // il == 6: swa
174
  // etc ...
175
- void set_swa_pattern(uint32_t n_pattern);
 
 
 
 
 
 
176
 
177
  // return true if one of the layers is SWA
178
  bool is_swa_any() const;
 
9
  #define LLAMA_MAX_EXPERTS 384 // Kimi-K2
10
 
11
  enum llama_expert_gating_func_type {
12
+ LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
13
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
14
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
15
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT = 3, // applied to the router weights instead of the logits
16
  };
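Based only on the comment on the new enum value, a hedged plain-C++ sketch contrasting the two softmax placements: over all expert logits before selection (SOFTMAX) versus over just the selected router weights (SOFTMAX_WEIGHT, as read here). Numbers are illustrative.

// gating sketch: softmax over all logits vs softmax over only the selected (top-k) weights
#include <cmath>
#include <cstdio>
#include <vector>

static std::vector<float> softmax(const std::vector<float> & x) {
    std::vector<float> y(x.size());
    float denom = 0.0f;
    for (float v : x) {
        denom += std::exp(v);
    }
    for (size_t i = 0; i < x.size(); ++i) {
        y[i] = std::exp(x[i]) / denom;
    }
    return y;
}

int main() {
    const std::vector<float> logits = {2.0f, 0.5f, 1.0f, -1.0f}; // router logits for 4 experts
    const std::vector<int>   top2   = {0, 2};                    // indices of the selected experts

    // SOFTMAX: normalize over all logits, then read off the selected weights
    const std::vector<float> p_all = softmax(logits);
    printf("softmax over logits : %.3f %.3f\n", p_all[top2[0]], p_all[top2[1]]);

    // SOFTMAX_WEIGHT (as interpreted from the comment): normalize over the selected weights only
    const std::vector<float> p_sel = softmax({logits[top2[0]], logits[top2[1]]});
    printf("softmax over weights: %.3f %.3f\n", p_sel[0], p_sel[1]);
    return 0;
}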
17
 
18
  enum llama_swa_type {
 
74
  bool expert_weights_norm = false;
75
  uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
76
  uint32_t moe_every_n_layers = 0;
77
+ uint32_t nextn_predict_layers = 0;
78
 
79
  float f_norm_eps;
80
  float f_norm_rms_eps;
 
142
  // for Classifiers
143
  uint32_t n_cls_out = 1;
144
 
145
+ // llama4, smallthinker
146
  uint32_t n_moe_layer_step = 0;
147
  uint32_t n_no_rope_layer_step = 4;
148
  uint32_t n_attn_temp_floor_scale = 8192;
 
163
  enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
164
 
165
  // this value n_pattern means that every nth layer is dense (i.e. non-SWA)
166
+ // dense_first means whether the pattern starts with a dense layer
167
  // note that if n_pattern == 0, all layers are SWA
168
  // if n_pattern == 1, all layers are dense
169
+ // example 1: n_pattern = 3, dense_first = false
170
  // il == 0: swa
171
  // il == 1: swa
172
  // il == 2: dense
 
175
  // il == 5: dense
176
  // il == 6: swa
177
  // etc ...
178
+ // example 2: n_pattern = 2, dense_first = true
179
+ // il == 0: dense
180
+ // il == 1: swa
181
+ // il == 2: dense
182
+ // il == 3: swa
183
+ // etc ...
184
+ void set_swa_pattern(uint32_t n_pattern, bool dense_first = false);
185
 
186
  // return true if one of the layers is SWA
187
  bool is_swa_any() const;
examples/talk-llama/llama-kv-cache-unified-iswa.cpp CHANGED
@@ -194,14 +194,20 @@ bool llama_kv_cache_unified_iswa::get_can_shift() const {
194
  return kv_base->get_size() == kv_swa->get_size();
195
  }
196
 
197
- void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
198
- kv_base->state_write(io, seq_id);
199
- kv_swa ->state_write(io, seq_id);
 
 
 
200
  }
201
 
202
- void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
203
- kv_base->state_read(io, seq_id);
204
- kv_swa ->state_read(io, seq_id);
 
 
 
205
  }
206
 
207
  llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_base() const {
 
194
  return kv_base->get_size() == kv_swa->get_size();
195
  }
196
 
197
+ void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
198
+ if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
199
+ kv_base->state_write(io, seq_id, flags);
200
+ }
201
+
202
+ kv_swa->state_write(io, seq_id, flags);
203
  }
204
 
205
+ void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
206
+ if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
207
+ kv_base->state_read(io, seq_id, flags);
208
+ }
209
+
210
+ kv_swa->state_read(io, seq_id, flags);
211
  }
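The flag check above is a plain bitmask gate; a tiny standalone illustration (the concrete bit value is hypothetical, only the usage pattern is taken from the code): with the SWA-only flag set, only the SWA cache is serialized, otherwise both caches are.

// sketch of the gating above: with SWA_ONLY set, the base cache is skipped
#include <cstdint>
#include <cstdio>

typedef uint32_t llama_state_seq_flags;                // assumed to be an integer bitmask
static const llama_state_seq_flags SWA_ONLY = 1u << 0; // hypothetical bit value, for illustration only

static void state_write(llama_state_seq_flags flags) {
    if ((flags & SWA_ONLY) == 0) {
        printf("write base KV cache\n");
    }
    printf("write SWA KV cache\n");
}

int main() {
    state_write(0);        // writes both caches
    state_write(SWA_ONLY); // writes only the SWA cache
    return 0;
}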
212
 
213
  llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_base() const {
examples/talk-llama/llama-kv-cache-unified-iswa.h CHANGED
@@ -56,8 +56,8 @@ public:
56
 
57
  // state write/load
58
 
59
- void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
60
- void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
61
 
62
  //
63
  // llama_kv_cache_unified_iswa specific API
 
56
 
57
  // state write/load
58
 
59
+ void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
60
+ void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
61
 
62
  //
63
  // llama_kv_cache_unified_iswa specific API
examples/talk-llama/llama-kv-cache-unified.cpp CHANGED
@@ -39,6 +39,10 @@ llama_kv_cache_unified::llama_kv_cache_unified(
39
  if (model.arch == LLM_ARCH_GEMMA3N) {
40
  n_layer_cache = 20;
41
  }
 
 
 
 
42
 
43
  // create a context for each buffer type
44
  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -183,7 +187,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
183
  const size_t memory_size_k = size_k_bytes();
184
  const size_t memory_size_v = size_v_bytes();
185
 
186
- LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
187
  (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, n_stream,
188
  ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
189
  ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
@@ -193,7 +197,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
193
  debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
194
 
195
  const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
196
- supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : 0;
197
 
198
  if (!supports_set_rows) {
199
  // ref: https://github.com/ggml-org/llama.cpp/pull/14363
@@ -219,12 +223,7 @@ void llama_kv_cache_unified::clear(bool data) {
219
  }
220
 
221
  bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
222
- GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
223
-
224
- auto & cells = v_cells[seq_to_stream[seq_id]];
225
- auto & head = v_heads[seq_to_stream[seq_id]];
226
-
227
- uint32_t new_head = cells.size();
228
 
229
  if (p0 < 0) {
230
  p0 = 0;
@@ -235,6 +234,11 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
235
  }
236
 
237
  if (seq_id >= 0) {
 
 
 
 
 
238
  for (uint32_t i = 0; i < cells.size(); ++i) {
239
  if (!cells.pos_in(i, p0, p1)) {
240
  continue;
@@ -246,24 +250,36 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
246
  }
247
  }
248
  }
 
 
 
 
 
249
  } else {
250
  // match any sequence
251
- for (uint32_t i = 0; i < cells.size(); ++i) {
252
- if (!cells.pos_in(i, p0, p1)) {
253
- continue;
254
- }
255
 
256
- cells.rm(i);
257
 
258
- if (new_head == cells.size()) {
259
- new_head = i;
 
 
 
 
 
 
 
 
260
  }
261
- }
262
- }
263
 
264
- // If we freed up a slot, set head to it so searching can start there.
265
- if (new_head != cells.size() && new_head < head) {
266
- head = new_head;
 
 
267
  }
268
 
269
  return true;
@@ -734,66 +750,70 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
734
  }
735
 
736
  llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch, bool cont) const {
737
- if (debug > 0) {
738
- const auto & cells = v_cells[seq_to_stream[1]];
739
 
740
- const uint32_t head_cur = v_heads[1];
741
-
742
- LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
743
- __func__, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);
 
 
 
 
 
 
 
 
 
 
 
 
 
744
 
745
- if ((debug == 2 && n_swa > 0) || debug > 2) {
746
- std::string ss;
747
- for (uint32_t i = 0; i < cells.size(); ++i) {
748
- if (cells.is_empty(i)) {
749
- ss += '.';
750
- } else {
751
- assert(cells.seq_count(i) >= 1);
 
 
 
 
 
 
752
 
753
- if (cells.seq_count(i) == 1) {
754
- ss += std::to_string(cells.seq_get(i));
 
 
 
 
755
  } else {
756
- ss += 'M';
 
 
 
 
 
 
 
 
 
 
 
757
  }
758
  }
759
- if (i%256 == 255) {
760
- ss += " *";
761
- ss += '\n';
762
- }
763
  }
764
- LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
765
- }
766
 
767
- if ((debug == 2 && n_swa > 0) || debug > 2) {
768
- std::string ss;
769
- for (uint32_t i = 0; i < cells.size(); ++i) {
770
- std::string cur;
771
- if (cells.is_empty(i)) {
772
- cur = '.';
773
- } else {
774
- cur = std::to_string(cells.pos_get(i));
775
- }
776
- const int n = cur.size();
777
- for (int j = 0; j < 5 - n; ++j) {
778
- cur += ' ';
779
- }
780
- ss += cur;
781
- if (i%256 == 255) {
782
- ss += " *";
783
- }
784
- if (i%64 == 63) {
785
- ss += '\n';
786
  }
787
- }
788
- LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
789
- }
790
 
791
- for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
792
- if (cells.seq_pos_min(s) < 0) {
793
- continue;
794
  }
795
-
796
- LLAMA_LOG_DEBUG("%s: min[%d] = %5d, max[%d] = %5d\n", __func__, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
797
  }
798
  }
799
 
@@ -1808,7 +1828,9 @@ bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
1808
  return false;
1809
  }
1810
 
1811
- void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
 
 
1812
  io.write(&n_stream, sizeof(n_stream));
1813
 
1814
  for (uint32_t s = 0; s < n_stream; ++s) {
@@ -1859,7 +1881,9 @@ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq
1859
  }
1860
  }
1861
 
1862
- void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
 
 
1863
  GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
1864
 
1865
  uint32_t n_stream_cur;
 
39
  if (model.arch == LLM_ARCH_GEMMA3N) {
40
  n_layer_cache = 20;
41
  }
42
+ if (model.arch == LLM_ARCH_GLM4_MOE) {
43
+ // GLM-4.5: only allocate cache up to the last regular layer, skipping the final NextN layer(s)
44
+ n_layer_cache = hparams.n_layer - hparams.nextn_predict_layers;
45
+ }
46
 
47
  // create a context for each buffer type
48
  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
 
187
  const size_t memory_size_k = size_k_bytes();
188
  const size_t memory_size_v = size_v_bytes();
189
 
190
+ LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
191
  (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, n_stream,
192
  ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
193
  ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
 
197
  debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
198
 
199
  const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
200
+ supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;
201
 
202
  if (!supports_set_rows) {
203
  // ref: https://github.com/ggml-org/llama.cpp/pull/14363
 
223
  }
224
 
225
  bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
226
+ GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
 
 
 
 
 
227
 
228
  if (p0 < 0) {
229
  p0 = 0;
 
234
  }
235
 
236
  if (seq_id >= 0) {
237
+ auto & cells = v_cells[seq_to_stream[seq_id]];
238
+ auto & head = v_heads[seq_to_stream[seq_id]];
239
+
240
+ uint32_t new_head = cells.size();
241
+
242
  for (uint32_t i = 0; i < cells.size(); ++i) {
243
  if (!cells.pos_in(i, p0, p1)) {
244
  continue;
 
250
  }
251
  }
252
  }
253
+
254
+ // If we freed up a slot, set head to it so searching can start there.
255
+ if (new_head != cells.size() && new_head < head) {
256
+ head = new_head;
257
+ }
258
  } else {
259
  // match any sequence
260
+ for (uint32_t s = 0; s < n_stream; ++s) {
261
+ auto & cells = v_cells[s];
262
+ auto & head = v_heads[s];
 
263
 
264
+ uint32_t new_head = cells.size();
265
 
266
+ for (uint32_t i = 0; i < cells.size(); ++i) {
267
+ if (!cells.pos_in(i, p0, p1)) {
268
+ continue;
269
+ }
270
+
271
+ cells.rm(i);
272
+
273
+ if (new_head == cells.size()) {
274
+ new_head = i;
275
+ }
276
  }
 
 
277
 
278
+ // If we freed up a slot, set head to it so searching can start there.
279
+ if (new_head != cells.size() && new_head < head) {
280
+ head = new_head;
281
+ }
282
+ }
283
  }
284
 
285
  return true;
 
750
  }
751
 
752
  llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch, bool cont) const {
 
 
753
 
754
+ if (debug > 0) {
755
+ for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
756
+ const auto seq_id = ubatch.seq_id_unq[s];
757
+ const auto stream_id = seq_to_stream[seq_id];
758
+ const auto & cells = v_cells[stream_id];
759
+ const uint32_t head_cur = v_heads[stream_id];
760
+
761
+ LLAMA_LOG_DEBUG("%s: stream[%d], n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
762
+ __func__, stream_id, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);
763
+
764
+ if ((debug == 2 && n_swa > 0) || debug > 2) {
765
+ std::string ss;
766
+ for (uint32_t i = 0; i < cells.size(); ++i) {
767
+ if (cells.is_empty(i)) {
768
+ ss += '.';
769
+ } else {
770
+ assert(cells.seq_count(i) >= 1);
771
 
772
+ if (cells.seq_count(i) == 1) {
773
+ ss += std::to_string(cells.seq_get(i));
774
+ } else {
775
+ ss += 'M';
776
+ }
777
+ }
778
+ if (i%256 == 255) {
779
+ ss += " *";
780
+ ss += '\n';
781
+ }
782
+ }
783
+ LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
784
+ }
785
 
786
+ if ((debug == 2 && n_swa > 0) || debug > 2) {
787
+ std::string ss;
788
+ for (uint32_t i = 0; i < cells.size(); ++i) {
789
+ std::string cur;
790
+ if (cells.is_empty(i)) {
791
+ cur = '.';
792
  } else {
793
+ cur = std::to_string(cells.pos_get(i));
794
+ }
795
+ const int n = cur.size();
796
+ for (int j = 0; j < 5 - n; ++j) {
797
+ cur += ' ';
798
+ }
799
+ ss += cur;
800
+ if (i%256 == 255) {
801
+ ss += " *";
802
+ }
803
+ if (i%64 == 63) {
804
+ ss += '\n';
805
  }
806
  }
807
+ LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
 
 
 
808
  }
 
 
809
 
810
+ for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
811
+ if (cells.seq_pos_min(s) < 0) {
812
+ continue;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
813
  }
 
 
 
814
 
815
+ LLAMA_LOG_DEBUG("%s: stream[%d] min[%d] = %5d, max[%d] = %5d\n", __func__, stream_id, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
 
 
816
  }
 
 
817
  }
818
  }
819
 
 
1828
  return false;
1829
  }
1830
 
1831
+ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
1832
+ GGML_UNUSED(flags);
1833
+
1834
  io.write(&n_stream, sizeof(n_stream));
1835
 
1836
  for (uint32_t s = 0; s < n_stream; ++s) {
 
1881
  }
1882
  }
1883
 
1884
+ void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
1885
+ GGML_UNUSED(flags);
1886
+
1887
  GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
1888
 
1889
  uint32_t n_stream_cur;
examples/talk-llama/llama-kv-cache-unified.h CHANGED
@@ -136,8 +136,8 @@ public:
136
 
137
  // state write/load
138
 
139
- void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
140
- void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
141
 
142
  //
143
  // llama_kv_cache_unified specific API
@@ -230,7 +230,7 @@ private:
230
 
231
  // env: LLAMA_SET_ROWS (temporary)
232
  // ref: https://github.com/ggml-org/llama.cpp/pull/14285
233
- bool supports_set_rows = false;
234
 
235
  const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
236
 
 
136
 
137
  // state write/load
138
 
139
+ void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
140
+ void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
141
 
142
  //
143
  // llama_kv_cache_unified specific API
 
230
 
231
  // env: LLAMA_SET_ROWS (temporary)
232
  // ref: https://github.com/ggml-org/llama.cpp/pull/14285
233
+ bool supports_set_rows = true;
234
 
235
  const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
236
 
examples/talk-llama/llama-memory-hybrid.cpp CHANGED
@@ -25,6 +25,7 @@ llama_memory_hybrid::llama_memory_hybrid(
25
  /* common */
26
  uint32_t n_seq_max,
27
  bool offload,
 
28
  /* layer filters */
29
  layer_filter_cb && filter_attn,
30
  layer_filter_cb && filter_recr) :
@@ -38,7 +39,7 @@ llama_memory_hybrid::llama_memory_hybrid(
38
  type_v,
39
  v_trans,
40
  offload,
41
- 1,
42
  kv_size,
43
  n_seq_max,
44
  n_pad,
@@ -164,12 +165,16 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
164
  return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
165
  }
166
 
167
- void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
 
 
168
  mem_attn->state_write(io, seq_id);
169
  mem_recr->state_write(io, seq_id);
170
  }
171
 
172
- void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
 
 
173
  mem_attn->state_read(io, seq_id);
174
  mem_recr->state_read(io, seq_id);
175
  }
 
25
  /* common */
26
  uint32_t n_seq_max,
27
  bool offload,
28
+ bool unified,
29
  /* layer filters */
30
  layer_filter_cb && filter_attn,
31
  layer_filter_cb && filter_recr) :
 
39
  type_v,
40
  v_trans,
41
  offload,
42
+ unified,
43
  kv_size,
44
  n_seq_max,
45
  n_pad,
 
165
  return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
166
  }
167
 
168
+ void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
169
+ GGML_UNUSED(flags);
170
+
171
  mem_attn->state_write(io, seq_id);
172
  mem_recr->state_write(io, seq_id);
173
  }
174
 
175
+ void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
176
+ GGML_UNUSED(flags);
177
+
178
  mem_attn->state_read(io, seq_id);
179
  mem_recr->state_read(io, seq_id);
180
  }
examples/talk-llama/llama-memory-hybrid.h CHANGED
@@ -39,6 +39,7 @@ public:
39
  /* common */
40
  uint32_t n_seq_max,
41
  bool offload,
 
42
  /* layer filters */
43
  layer_filter_cb && filter_attn = nullptr,
44
  layer_filter_cb && filter_recr = nullptr);
@@ -73,8 +74,8 @@ public:
73
 
74
  // state write/load
75
 
76
- void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
77
- void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
78
 
79
  //
80
  // llama_memory_hybrid specific API
 
39
  /* common */
40
  uint32_t n_seq_max,
41
  bool offload,
42
+ bool unified,
43
  /* layer filters */
44
  layer_filter_cb && filter_attn = nullptr,
45
  layer_filter_cb && filter_recr = nullptr);
 
74
 
75
  // state write/load
76
 
77
+ void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
78
+ void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
79
 
80
  //
81
  // llama_memory_hybrid specific API
examples/talk-llama/llama-memory-recurrent.cpp CHANGED
@@ -680,7 +680,9 @@ size_t llama_memory_recurrent::size_s_bytes() const {
680
  return size_s_bytes;
681
  }
682
 
683
- void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
 
 
684
  std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
685
  uint32_t cell_count = 0;
686
 
@@ -718,7 +720,9 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
718
  state_write_data(io, cell_ranges);
719
  }
720
 
721
- void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
 
 
722
  uint32_t cell_count;
723
  io.read_to(&cell_count, sizeof(cell_count));
724
 
 
680
  return size_s_bytes;
681
  }
682
 
683
+ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
684
+ GGML_UNUSED(flags);
685
+
686
  std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
687
  uint32_t cell_count = 0;
688
 
 
720
  state_write_data(io, cell_ranges);
721
  }
722
 
723
+ void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
724
+ GGML_UNUSED(flags);
725
+
726
  uint32_t cell_count;
727
  io.read_to(&cell_count, sizeof(cell_count));
728
 
examples/talk-llama/llama-memory-recurrent.h CHANGED
@@ -63,8 +63,8 @@ public:
63
 
64
  // state write/load
65
 
66
- void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
67
- void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
68
 
69
  uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
70
  uint32_t size = 0; // total number of cells, shared across all sequences
 
63
 
64
  // state write/load
65
 
66
+ void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
67
+ void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
68
 
69
  uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
70
  uint32_t size = 0; // total number of cells, shared across all sequences
examples/talk-llama/llama-memory.h CHANGED
@@ -104,8 +104,8 @@ struct llama_memory_i {
104
  // state write/read
105
  //
106
 
107
- virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
108
- virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0;
109
  };
110
 
111
  using llama_memory_ptr = std::unique_ptr<llama_memory_i>;
 
104
  // state write/read
105
  //
106
 
107
+ virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const = 0;
108
+ virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) = 0;
109
  };
110
 
111
  using llama_memory_ptr = std::unique_ptr<llama_memory_i>;
examples/talk-llama/llama-model-loader.cpp CHANGED
@@ -35,6 +35,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
35
  case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
36
  case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
37
  case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
 
38
  case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
39
  case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
40
  case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
 
35
  case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
36
  case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
37
  case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
38
+ case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE";
39
  case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
40
  case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
41
  case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
examples/talk-llama/llama-model-loader.h CHANGED
@@ -58,8 +58,9 @@ struct llama_model_loader {
58
  }
59
  };
60
 
61
- static const int TENSOR_NOT_REQUIRED = 1;
62
- static const int TENSOR_DUPLICATED = 2;
 
63
 
64
  int n_kv = 0;
65
  int n_tensors = 0;
 
58
  }
59
  };
60
 
61
+ static const int TENSOR_NOT_REQUIRED = 1 << 0;
62
+ static const int TENSOR_DUPLICATED = 1 << 1;
63
+ static const int TENSOR_SKIP = 1 << 2;
64
 
65
  int n_kv = 0;
66
  int n_tensors = 0;
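Since the loader constants are now distinct bits, they can be OR-ed together and tested independently; a small hypothetical illustration of that usage:

// sketch: independent bit flags can be combined and queried separately
#include <cstdio>

static const int TENSOR_NOT_REQUIRED = 1 << 0;
static const int TENSOR_DUPLICATED   = 1 << 1;
static const int TENSOR_SKIP         = 1 << 2;

int main() {
    const int flags = TENSOR_NOT_REQUIRED | TENSOR_SKIP; // hypothetical combination for one tensor

    printf("not required: %d\n", (flags & TENSOR_NOT_REQUIRED) != 0); // 1
    printf("duplicated  : %d\n", (flags & TENSOR_DUPLICATED)   != 0); // 0
    printf("skip        : %d\n", (flags & TENSOR_SKIP)         != 0); // 1
    return 0;
}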
examples/talk-llama/llama-model.cpp CHANGED
@@ -109,8 +109,10 @@ const char * llm_type_name(llm_type type) {
109
  case LLM_TYPE_A13B: return "A13B";
110
  case LLM_TYPE_21B_A3B: return "21B.A3B";
111
  case LLM_TYPE_30B_A3B: return "30B.A3B";
 
112
  case LLM_TYPE_235B_A22B: return "235B.A22B";
113
  case LLM_TYPE_300B_A47B: return "300B.A47B";
 
114
  case LLM_TYPE_E2B: return "E2B";
115
  case LLM_TYPE_E4B: return "E4B";
116
  default: return "?B";
@@ -190,6 +192,13 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
190
  ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
191
  op_tensor = ggml_add(ctx, a, w);
192
  } break;
 
 
 
 
 
 
 
193
  case GGML_OP_MUL:
194
  {
195
  ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
@@ -258,6 +267,10 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
258
  ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
259
  op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
260
  } break;
 
 
 
 
261
  default:
262
  GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
263
  }
@@ -290,7 +303,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
290
  }
291
 
292
  // CPU: ACCEL -> GPU host -> CPU extra -> CPU
293
- static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
294
  buft_list_t buft_list;
295
 
296
  // add ACCEL buffer types
@@ -319,21 +332,22 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
319
  }
320
  }
321
 
322
- // add extra buffer types, only if no GPU device is present
323
- // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
324
- auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
325
- if (cpu_dev == nullptr) {
326
- throw std::runtime_error(format("%s: no CPU backend found", __func__));
327
- }
328
 
329
- auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
330
- auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
331
- ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
332
- if (ggml_backend_dev_get_extra_bufts_fn) {
333
- ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
334
- while (extra_bufts && *extra_bufts) {
335
- buft_list.emplace_back(cpu_dev, *extra_bufts);
336
- ++extra_bufts;
 
337
  }
338
  }
339
 
@@ -869,6 +883,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
869
  hparams.causal_attn = false;
870
  }
871
  break;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
872
  case LLM_ARCH_QWEN2MOE:
873
  {
874
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -883,6 +912,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
883
  } break;
884
  case LLM_ARCH_QWEN3:
885
  {
 
886
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
887
  switch (hparams.n_layer) {
888
  case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
@@ -1065,6 +1095,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1065
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1066
 
1067
  switch (hparams.n_layer) {
 
1068
  case 26: type = LLM_TYPE_1B; break;
1069
  case 34: type = LLM_TYPE_4B; break;
1070
  case 48: type = LLM_TYPE_12B; break;
@@ -1417,6 +1448,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1417
  default: type = LLM_TYPE_UNKNOWN;
1418
  }
1419
  } break;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1420
  case LLM_ARCH_BITNET:
1421
  {
1422
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1744,6 +1803,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1744
  default: type = LLM_TYPE_UNKNOWN;
1745
  }
1746
  } break;
 
 
 
 
 
 
 
 
 
 
 
 
1747
  case LLM_ARCH_SMOLLM3:
1748
  {
1749
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1754,6 +1825,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1754
  default: type = LLM_TYPE_UNKNOWN;
1755
  }
1756
  } break;
 
 
 
 
 
 
 
 
 
 
 
1757
  case LLM_ARCH_LFM2:
1758
  {
1759
  ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
@@ -1768,6 +1850,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1768
  default: type = LLM_TYPE_UNKNOWN;
1769
  }
1770
  } break;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1771
  default: throw std::runtime_error("unsupported model architecture");
1772
  }
1773
 
@@ -1801,7 +1906,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
1801
  LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
1802
 
1803
  // build a list of buffer types for the CPU and GPU devices
1804
- pimpl->cpu_buft_list = make_cpu_buft_list(devices);
1805
  for (auto * dev : devices) {
1806
  buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
1807
  // add CPU buffer types as a fallback
@@ -1897,6 +2002,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
1897
 
1898
  const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
1899
  const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
 
1900
 
1901
  // create tensors for the weights
1902
  {
@@ -1952,7 +2058,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
1952
  }
1953
 
1954
  // skip unused tensors
1955
- if (info.op == GGML_OP_NONE) {
1956
  const size_t nbytes = ggml_nbytes(t_meta);
1957
  LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
1958
 
@@ -1962,11 +2068,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
1962
  return nullptr;
1963
  }
1964
 
1965
- // tensors with "bias" suffix are always used with GGML_OP_ADD
1966
  ggml_op op;
1967
  bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
1968
  if (bias) {
1969
- op = GGML_OP_ADD;
 
 
 
 
1970
  } else {
1971
  op = info.op;
1972
  }
@@ -2006,7 +2116,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2006
  for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
2007
  std::regex pattern(overrides->pattern);
2008
  if (std::regex_search(tensor_name, pattern)) {
2009
- buft = overrides->buft;
 
 
 
 
 
 
2010
  LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
2011
  tensor_name.c_str(),
2012
  ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
@@ -2126,6 +2242,53 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2126
  }
2127
  }
2128
  } break;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2129
  case LLM_ARCH_LLAMA4:
2130
  {
2131
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4322,6 +4485,105 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
4322
  layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
4323
  }
4324
  } break;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4325
  case LLM_ARCH_NEMOTRON:
4326
  {
4327
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5103,6 +5365,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5103
  layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
5104
  }
5105
  } break;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5106
  case LLM_ARCH_SMOLLM3:
5107
  {
5108
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5132,6 +5427,46 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5132
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5133
  }
5134
  } break;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5135
  case LLM_ARCH_LFM2:
5136
  {
5137
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5165,6 +5500,42 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5165
  }
5166
  }
5167
  } break;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5168
  default:
5169
  throw std::runtime_error("unknown architecture");
5170
  }
@@ -5468,7 +5839,7 @@ void llama_model::print_info() const {
5468
  LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
5469
  }
5470
 
5471
- if (arch == LLM_ARCH_QWEN3MOE) {
5472
  LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
5473
  }
5474
 
@@ -5490,6 +5861,11 @@ void llama_model::print_info() const {
5490
  LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
5491
  }
5492
 
 
 
 
 
 
5493
  vocab.print_info();
5494
  }
5495
 
@@ -7978,8 +8354,10 @@ struct llm_build_dream : public llm_graph_context {
7978
  }
7979
  };
7980
 
7981
- struct llm_build_qwen2vl : public llm_graph_context {
7982
- llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
 
 
7983
  const int64_t n_embd_head = hparams.n_embd_head_v;
7984
 
7985
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7993,10 +8371,8 @@ struct llm_build_qwen2vl : public llm_graph_context {
7993
  // inp_pos - contains the positions
7994
  ggml_tensor * inp_pos = build_inp_pos();
7995
 
7996
- auto * inp_attn = build_attn_inp_kv_unified();
7997
-
7998
- int sections[4];
7999
- std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
8000
 
8001
  ggml_tensor * inp_out_ids = build_inp_out_ids();
8002
 
@@ -8004,34 +8380,134 @@ struct llm_build_qwen2vl : public llm_graph_context {
8004
  ggml_tensor * inpSA = inpL;
8005
 
8006
  // norm
8007
- cur = build_norm(inpL,
8008
- model.layers[il].attn_norm, NULL,
8009
- LLM_NORM_RMS, il);
8010
  cb(cur, "attn_norm", il);
8011
 
8012
  // self-attention
8013
  {
8014
- // compute Q and K and RoPE them
8015
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
8016
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8017
- cb(Qcur, "Qcur", il);
8018
-
8019
  ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
8020
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
8021
- cb(Kcur, "Kcur", il);
8022
-
8023
  ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
8024
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
 
 
8025
  cb(Vcur, "Vcur", il);
8026
 
8027
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8028
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8029
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8030
 
8031
- Qcur = ggml_rope_multi(
8032
- ctx0, Qcur, inp_pos, nullptr,
8033
- n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
8034
- ext_factor, attn_factor, beta_fast, beta_slow
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8035
  );
8036
 
8037
  Kcur = ggml_rope_multi(
@@ -13285,6 +13761,165 @@ struct llm_build_glm4 : public llm_graph_context {
13285
  }
13286
  };
13287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13288
  struct llm_build_nemotron : public llm_graph_context {
13289
  llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
13290
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -16697,8 +17332,8 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
16697
  }
16698
  };
16699
 
16700
- struct llm_build_smollm3 : public llm_graph_context {
16701
- llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
16702
  const int64_t n_embd_head = hparams.n_embd_head_v;
16703
 
16704
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -16714,23 +17349,23 @@ struct llm_build_smollm3 : public llm_graph_context {
16714
 
16715
  auto * inp_attn = build_attn_inp_kv_unified();
16716
 
16717
- const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
16718
 
16719
  ggml_tensor * inp_out_ids = build_inp_out_ids();
16720
 
16721
  for (int il = 0; il < n_layer; ++il) {
16722
  ggml_tensor * inpSA = inpL;
16723
 
16724
- const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
16725
-
16726
  // norm
16727
  cur = build_norm(inpL,
16728
  model.layers[il].attn_norm, NULL,
16729
  LLM_NORM_RMS, il);
16730
  cb(cur, "attn_norm", il);
16731
-
16732
  // self-attention
16733
  {
 
 
 
16734
  // compute Q and K and RoPE them
16735
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
16736
  cb(Qcur, "Qcur", il);
@@ -16757,10 +17392,148 @@ struct llm_build_smollm3 : public llm_graph_context {
16757
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
16758
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
16759
 
16760
- if (use_rope) {
16761
- Qcur = ggml_rope_ext(
16762
- ctx0, Qcur, inp_pos, nullptr,
16763
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16764
  ext_factor, attn_factor, beta_fast, beta_slow
16765
  );
16766
 
@@ -16834,6 +17607,136 @@ struct llm_build_smollm3 : public llm_graph_context {
16834
  }
16835
  };
16836
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  case LLM_TYPE_A13B: return "A13B";
110
  case LLM_TYPE_21B_A3B: return "21B.A3B";
111
  case LLM_TYPE_30B_A3B: return "30B.A3B";
112
+ case LLM_TYPE_106B_A12B: return "106B.A12B";
113
  case LLM_TYPE_235B_A22B: return "235B.A22B";
114
  case LLM_TYPE_300B_A47B: return "300B.A47B";
115
+ case LLM_TYPE_355B_A32B: return "355B.A32B";
116
  case LLM_TYPE_E2B: return "E2B";
117
  case LLM_TYPE_E4B: return "E4B";
118
  default: return "?B";
 
192
  ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
193
  op_tensor = ggml_add(ctx, a, w);
194
  } break;
195
+ case GGML_OP_ADD_ID:
196
+ {
197
+ int n_expert_used = hparams.n_expert_used;
198
+ ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
199
+ ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
200
+ op_tensor = ggml_add_id(ctx, a, w, c);
201
+ } break;
202
  case GGML_OP_MUL:
203
  {
204
  ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
 
267
  ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
268
  op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
269
  } break;
270
+ case GGML_OP_SCALE:
271
+ {
272
+ op_tensor = ggml_scale(ctx, w, 1.0f);
273
+ } break;
274
  default:
275
  GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
276
  }
 
303
  }
304
 
305
  // CPU: ACCEL -> GPU host -> CPU extra -> CPU
306
+ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
307
  buft_list_t buft_list;
308
 
309
  // add ACCEL buffer types
 
332
  }
333
  }
334
 
335
+ // add extra buffer types
336
+ if (use_extra_bufts) {
337
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
338
+ if (cpu_dev == nullptr) {
339
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
340
+ }
341
 
342
+ auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
343
+ auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
344
+ ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
345
+ if (ggml_backend_dev_get_extra_bufts_fn) {
346
+ ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
347
+ while (extra_bufts && *extra_bufts) {
348
+ buft_list.emplace_back(cpu_dev, *extra_bufts);
349
+ ++extra_bufts;
350
+ }
351
  }
352
  }
353
 
 
883
  hparams.causal_attn = false;
884
  }
885
  break;
886
+ case LLM_ARCH_LLADA:
887
+ {
888
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
889
+ // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
890
+ switch (hparams.n_layer) {
891
+ case 32:
892
+ type = LLM_TYPE_8B;
893
+ break;
894
+ default:
895
+ type = LLM_TYPE_UNKNOWN;
896
+ }
897
+ // Set non-causal attention for diffusion models
898
+ hparams.causal_attn = false;
899
+ }
900
+ break;
901
  case LLM_ARCH_QWEN2MOE:
902
  {
903
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
 
912
  } break;
913
  case LLM_ARCH_QWEN3:
914
  {
915
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
916
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
917
  switch (hparams.n_layer) {
918
  case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
 
1095
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1096
 
1097
  switch (hparams.n_layer) {
1098
+ case 18: type = LLM_TYPE_537M; break;
1099
  case 26: type = LLM_TYPE_1B; break;
1100
  case 34: type = LLM_TYPE_4B; break;
1101
  case 48: type = LLM_TYPE_12B; break;
 
1448
  default: type = LLM_TYPE_UNKNOWN;
1449
  }
1450
  } break;
1451
+ case LLM_ARCH_GLM4_MOE:
1452
+ {
1453
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1454
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1455
+
1456
+ // MoE parameters
1457
+ ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
1458
+ ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
1459
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
1460
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
1461
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
1462
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1463
+
1464
+ // Expert gating function (GLM-4.5 uses sigmoid)
1465
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
1466
+ if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
1467
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
1468
+ }
1469
+
1470
+ // NextN/MTP parameters
1471
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
1472
+
1473
+ switch (hparams.n_layer) {
1474
+ case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
1475
+ case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
1476
+ default: type = LLM_TYPE_UNKNOWN;
1477
+ }
1478
+ } break;
1479
  case LLM_ARCH_BITNET:
1480
  {
1481
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
1803
  default: type = LLM_TYPE_UNKNOWN;
1804
  }
1805
  } break;
1806
+ case LLM_ARCH_HUNYUAN_DENSE:
1807
+ {
1808
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1809
+
1810
+ switch (hparams.n_embd) {
1811
+ case 1024: type = LLM_TYPE_0_5B; break;
1812
+ case 2048: type = LLM_TYPE_1_8B; break;
1813
+ case 3072: type = LLM_TYPE_4B; break;
1814
+ case 4096: type = LLM_TYPE_7B; break;
1815
+ default: type = LLM_TYPE_UNKNOWN;
1816
+ }
1817
+ } break;
1818
  case LLM_ARCH_SMOLLM3:
1819
  {
1820
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
1825
  default: type = LLM_TYPE_UNKNOWN;
1826
  }
1827
  } break;
1828
+ case LLM_ARCH_OPENAI_MOE:
1829
+ {
1830
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1831
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1832
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
1833
+
1834
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1835
+ hparams.set_swa_pattern(2);
1836
+
1837
+ // TODO: switch (hparams.n_layer)
1838
+ } break;
1839
  case LLM_ARCH_LFM2:
1840
  {
1841
  ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
 
1850
  default: type = LLM_TYPE_UNKNOWN;
1851
  }
1852
  } break;
1853
+ case LLM_ARCH_SMALLTHINKER:
1854
+ {
1855
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1856
+
1857
+ if (found_swa && hparams.n_swa > 0) {
1858
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1859
+ hparams.n_swa = 4096;
1860
+ hparams.set_swa_pattern(4, true);
1861
+ } else {
1862
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
1863
+ hparams.n_no_rope_layer_step = hparams.n_layer;
1864
+ }
1865
+
1866
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
1867
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1868
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
1869
+
1870
+ switch (hparams.n_layer) {
1871
+ case 32: type = LLM_TYPE_4B; break;
1872
+ case 52: type = LLM_TYPE_20B; break;
1873
+ default: type = LLM_TYPE_UNKNOWN;
1874
+ }
1875
+ } break;
1876
  default: throw std::runtime_error("unsupported model architecture");
1877
  }
1878
 
 
1906
  LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
1907
 
1908
  // build a list of buffer types for the CPU and GPU devices
1909
+ pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
1910
  for (auto * dev : devices) {
1911
  buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
1912
  // add CPU buffer types as a fallback
 
2002
 
2003
  const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
2004
  const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
2005
+ const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
2006
 
2007
  // create tensors for the weights
2008
  {
 
2058
  }
2059
 
2060
  // skip unused tensors
2061
+ if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
2062
  const size_t nbytes = ggml_nbytes(t_meta);
2063
  LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
2064
 
 
2068
  return nullptr;
2069
  }
2070
 
2071
+ // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
2072
  ggml_op op;
2073
  bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
2074
  if (bias) {
2075
+ if (info.op == GGML_OP_MUL_MAT_ID) {
2076
+ op = GGML_OP_ADD_ID;
2077
+ } else {
2078
+ op = GGML_OP_ADD;
2079
+ }
2080
  } else {
2081
  op = info.op;
2082
  }
 
2116
  for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
2117
  std::regex pattern(overrides->pattern);
2118
  if (std::regex_search(tensor_name, pattern)) {
2119
+ if (overrides->buft == ggml_backend_cpu_buffer_type()) {
2120
+ // when overriding to a CPU buffer, consider the extra buffer types
2121
+ buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
2122
+ } else {
2123
+ buft = overrides->buft;
2124
+ }
2125
+
2126
  LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
2127
  tensor_name.c_str(),
2128
  ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
 
2242
  }
2243
  }
2244
  } break;
2245
+ case LLM_ARCH_LLADA:
2246
+ {
2247
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
2248
+
2249
+ // output
2250
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
2251
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
2252
+
2253
+ // if output is NULL, init from the input tok embed
2254
+ if (output == NULL) {
2255
+ output =
2256
+ create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
2257
+ }
2258
+
2259
+ for (int i = 0; i < n_layer; ++i) {
2260
+ auto & layer = layers[i];
2261
+
2262
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
2263
+
2264
+ // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
2265
+ layer.wq =
2266
+ create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
2267
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
2268
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
2269
+ // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
2270
+ layer.wo =
2271
+ create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
2272
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
2273
+
2274
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
2275
+
2276
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
2277
+ TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2278
+
2279
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
2280
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
2281
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
2282
+
2283
+ // optional MLP bias
2284
+ layer.ffn_gate_b =
2285
+ create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
2286
+ layer.ffn_down_b =
2287
+ create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
2288
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
2289
+ }
2290
+ }
2291
+ break;
2292
  case LLM_ARCH_LLAMA4:
2293
  {
2294
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
4485
  layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
4486
  }
4487
  } break;
4488
+ case LLM_ARCH_GLM4_MOE:
4489
+ {
4490
+ const int64_t n_expert = hparams.n_expert;
4491
+ const int64_t n_expert_used = hparams.n_expert_used;
4492
+ const int64_t n_expert_shared = hparams.n_expert_shared;
4493
+
4494
+ GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
4495
+ GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
4496
+
4497
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
4498
+
4499
+ // output
4500
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
4501
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
4502
+ // if output is NULL, init from the input tok embed
4503
+ if (output == NULL) {
4504
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
4505
+ }
4506
+
4507
+ // Load ALL tensors including NextN layer to satisfy total tensor count
4508
+ // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
4509
+ for (int i = 0; i < n_layer; ++i) {
4510
+ int flags = 0;
4511
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
4512
+ // skip all tensors in the NextN layers
4513
+ flags |= TENSOR_SKIP;
4514
+ }
4515
+
4516
+ auto & layer = layers[i];
4517
+
4518
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
4519
+
4520
+ // GLM-style attention with bias terms
4521
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
4522
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
4523
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
4524
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
4525
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
4526
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
4527
+
4528
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
4529
+
4530
+ // K/Q norm tensors (optional for GLM-4.5 355B variant)
4531
+ layer.attn_q_norm = create_tensor(
4532
+ tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
4533
+ layer.attn_k_norm = create_tensor(
4534
+ tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
4535
+
4536
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
4537
+
4538
+ // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
4539
+ // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
4540
+ const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
4541
+
4542
+ if (use_moe) {
4543
+ // MoE layers
4544
+ layer.ffn_gate_inp =
4545
+ create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
4546
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
4547
+
4548
+ // MoE branch
4549
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
4550
+
4551
+ layer.ffn_gate_exps = create_tensor(
4552
+ tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
4553
+ layer.ffn_down_exps = create_tensor(
4554
+ tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
4555
+ layer.ffn_up_exps = create_tensor(
4556
+ tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
4557
+
4558
+ // Shared expert
4559
+ if (n_expert_shared > 0) {
4560
+ const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
4561
+ layer.ffn_gate_shexp = create_tensor(
4562
+ tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
4563
+ layer.ffn_down_shexp = create_tensor(
4564
+ tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
4565
+ layer.ffn_up_shexp = create_tensor(
4566
+ tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
4567
+ }
4568
+ } else {
4569
+ // Dense layers (first k layers) - GLM uses separate gate/up projections
4570
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
4571
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
4572
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
4573
+ }
4574
+
4575
+ // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
4576
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
4577
+ layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
4578
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
4579
+ layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
4580
+ layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
4581
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags);
4582
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags);
4583
+ }
4584
+ }
4585
+ }
4586
+ break;
4587
  case LLM_ARCH_NEMOTRON:
4588
  {
4589
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
5365
  layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
5366
  }
5367
  } break;
5368
+ case LLM_ARCH_HUNYUAN_DENSE:
5369
+ {
5370
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5371
+
5372
+ // output
5373
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5374
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5375
+ // if output is NULL, init from the input tok embed
5376
+ if (output == NULL) {
5377
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5378
+ }
5379
+
5380
+ for (int i = 0; i < n_layer; ++i) {
5381
+ auto & layer = layers[i];
5382
+
5383
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5384
+
5385
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5386
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5387
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5388
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5389
+
5390
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
5391
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
5392
+
5393
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5394
+
5395
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5396
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5397
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5398
+
5399
+ }
5400
+ } break;
5401
  case LLM_ARCH_SMOLLM3:
5402
  {
5403
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
5427
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5428
  }
5429
  } break;
5430
+ case LLM_ARCH_OPENAI_MOE:
5431
+ {
5432
+ const int64_t n_ff_exp = hparams.n_ff_exp;
5433
+
5434
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5435
+
5436
+ // output
5437
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5438
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5439
+
5440
+ for (int i = 0; i < n_layer; ++i) {
5441
+ auto & layer = layers[i];
5442
+
5443
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5444
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
5445
+
5446
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
5447
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
5448
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
5449
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
5450
+
5451
+ layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
5452
+
5453
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
5454
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
5455
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
5456
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
5457
+
5458
+ // bias
5459
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_head * n_rot}, 0);
5460
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_head_kv * n_rot}, 0);
5461
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_head_kv * n_rot}, 0);
5462
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
5463
+
5464
+ layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
5465
+ layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
5466
+ layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), { n_embd, n_expert}, 0);
5467
+ layer.ffn_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
5468
+ }
5469
+ } break;
5470
  case LLM_ARCH_LFM2:
5471
  {
5472
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
5500
  }
5501
  }
5502
  } break;
5503
+ case LLM_ARCH_SMALLTHINKER:
5504
+ {
5505
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
5506
+
5507
+ // output
5508
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
5509
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5510
+
5511
+ // if output is NULL, init from the input tok embed
5512
+ if (output == NULL) {
5513
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5514
+ }
5515
+
5516
+ for (int i = 0; i < n_layer; ++i) {
5517
+ auto & layer = layers[i];
5518
+
5519
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
5520
+
5521
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
5522
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
5523
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
5524
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
5525
+
5526
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
5527
+
5528
+ GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
5529
+ GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");
5530
+
5531
+ // MoE branch
5532
+ const int64_t n_ff_exp = hparams.n_ff_exp;
5533
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
5534
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
5535
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
5536
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
5537
+ }
5538
+ } break;
5539
  default:
5540
  throw std::runtime_error("unknown architecture");
5541
  }
 
5839
  LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
5840
  }
5841
 
5842
+ if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE) {
5843
  LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
5844
  }
5845
 
 
5861
  LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
5862
  }
5863
 
5864
+ if (arch == LLM_ARCH_SMALLTHINKER) {
5865
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
5866
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
5867
+ }
5868
+
5869
  vocab.print_info();
5870
  }
5871
 
 
8354
  }
8355
  };
8356
 
8357
+ struct llm_build_llada : public llm_graph_context {
8358
+ llm_build_llada(const llama_model & model, const llm_graph_params & params) :
8359
+ llm_graph_context(params) {
8360
+ // LLaDA is similar to LLaMA but uses non-causal attention for diffusion
8361
  const int64_t n_embd_head = hparams.n_embd_head_v;
8362
 
8363
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
8371
  // inp_pos - contains the positions
8372
  ggml_tensor * inp_pos = build_inp_pos();
8373
 
8374
+ // Non-causal attention for diffusion
8375
+ auto * inp_attn = build_attn_inp_no_cache();
 
 
8376
 
8377
  ggml_tensor * inp_out_ids = build_inp_out_ids();
8378
 
 
8380
  ggml_tensor * inpSA = inpL;
8381
 
8382
  // norm
8383
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
 
 
8384
  cb(cur, "attn_norm", il);
8385
 
8386
  // self-attention
8387
  {
8388
+ // compute separate Q, K, V projections without bias, matching LLaDALlamaBlock
8389
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
 
 
 
8390
  ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
 
 
 
8391
  ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
8392
+
8393
+ cb(Qcur, "Qcur", il);
8394
+ cb(Kcur, "Kcur", il);
8395
  cb(Vcur, "Vcur", il);
8396
 
8397
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8398
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8399
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8400
 
8401
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8402
+ ext_factor, attn_factor, beta_fast, beta_slow);
8403
+
8404
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8405
+ ext_factor, attn_factor, beta_fast, beta_slow);
8406
+
8407
+ cb(Qcur, "Qcur", il);
8408
+ cb(Kcur, "Kcur", il);
8409
+ cb(Vcur, "Vcur", il);
8410
+
8411
+ cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr,
8412
+ 1.0f / sqrtf(float(n_embd_head)), il);
8413
+ }
8414
+
8415
+ if (il == n_layer - 1 && inp_out_ids) {
8416
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8417
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8418
+ }
8419
+
8420
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8421
+ cb(ffn_inp, "ffn_inp", il);
8422
+
8423
+ // feed-forward network
8424
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
8425
+ cb(cur, "ffn_norm", il);
8426
+
8427
+ cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
8428
+ model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
8429
+ cb(cur, "ffn_out", il);
8430
+
8431
+ cur = ggml_add(ctx0, cur, ffn_inp);
8432
+
8433
+ cur = build_cvec(cur, il);
8434
+ cb(cur, "l_out", il);
8435
+
8436
+ // input for next layer
8437
+ inpL = cur;
8438
+ }
8439
+
8440
+ cur = inpL;
8441
+
8442
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
8443
+
8444
+ cb(cur, "result_norm", -1);
8445
+ res->t_embd = cur;
8446
+
8447
+ // lm_head
8448
+ cur = build_lora_mm(model.output, cur);
8449
+
8450
+ cb(cur, "result_output", -1);
8451
+ res->t_logits = cur;
8452
+
8453
+ ggml_build_forward_expand(gf, cur);
8454
+ }
8455
+ };
8456
+
8457
+ struct llm_build_qwen2vl : public llm_graph_context {
8458
+ llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
8459
+ const int64_t n_embd_head = hparams.n_embd_head_v;
8460
+
8461
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8462
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
8463
+
8464
+ ggml_tensor * cur;
8465
+ ggml_tensor * inpL;
8466
+
8467
+ inpL = build_inp_embd(model.tok_embd);
8468
+
8469
+ // inp_pos - contains the positions
8470
+ ggml_tensor * inp_pos = build_inp_pos();
8471
+
8472
+ auto * inp_attn = build_attn_inp_kv_unified();
8473
+
8474
+ int sections[4];
8475
+ std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
8476
+
8477
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8478
+
8479
+ for (int il = 0; il < n_layer; ++il) {
8480
+ ggml_tensor * inpSA = inpL;
8481
+
8482
+ // norm
8483
+ cur = build_norm(inpL,
8484
+ model.layers[il].attn_norm, NULL,
8485
+ LLM_NORM_RMS, il);
8486
+ cb(cur, "attn_norm", il);
8487
+
8488
+ // self-attention
8489
+ {
8490
+ // compute Q and K and RoPE them
8491
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
8492
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8493
+ cb(Qcur, "Qcur", il);
8494
+
8495
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
8496
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
8497
+ cb(Kcur, "Kcur", il);
8498
+
8499
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
8500
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8501
+ cb(Vcur, "Vcur", il);
8502
+
8503
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8504
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8505
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8506
+
8507
+ Qcur = ggml_rope_multi(
8508
+ ctx0, Qcur, inp_pos, nullptr,
8509
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
8510
+ ext_factor, attn_factor, beta_fast, beta_slow
8511
  );
8512
 
8513
  Kcur = ggml_rope_multi(
 
13761
  }
13762
  };
13763
 
13764
+ struct llm_build_glm4_moe : public llm_graph_context {
13765
+ llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
13766
+ const int64_t n_embd_head = hparams.n_embd_head_v;
13767
+
13768
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
13769
+
13770
+ ggml_tensor * cur;
13771
+ ggml_tensor * inpL;
13772
+
13773
+ inpL = build_inp_embd(model.tok_embd);
13774
+
13775
+ // inp_pos - contains the positions
13776
+ ggml_tensor * inp_pos = build_inp_pos();
13777
+
13778
+ auto * inp_attn = build_attn_inp_kv_unified();
13779
+
13780
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13781
+
13782
+ // Only process up to last layer (skip final NextN layer)
13783
+ // Final layer tensors are loaded but not processed in forward pass
13784
+ const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
13785
+ for (int il = 0; il < n_transformer_layers; ++il) {
13786
+ ggml_tensor * inpSA = inpL;
13787
+
13788
+ // Pre-attention norm
13789
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
13790
+ cb(cur, "attn_norm", il);
13791
+
13792
+ // self-attention
13793
+ {
13794
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
13795
+ if (model.layers[il].bq) {
13796
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
13797
+ }
13798
+ cb(Qcur, "Qcur", il);
13799
+
13800
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
13801
+ if (model.layers[il].bk) {
13802
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
13803
+ }
13804
+ cb(Kcur, "Kcur", il);
13805
+
13806
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
13807
+ if (model.layers[il].bv) {
13808
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
13809
+ }
13810
+ cb(Vcur, "Vcur", il);
13811
+
13812
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13813
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13814
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13815
+
13816
+ // Apply Q/K norm if available (GLM-4.5 355B variant)
13817
+ if (model.layers[il].attn_q_norm) {
13818
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
13819
+ cb(Qcur, "Qcur_normed", il);
13820
+ }
13821
+ if (model.layers[il].attn_k_norm) {
13822
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
13823
+ cb(Kcur, "Kcur_normed", il);
13824
+ }
13825
+
13826
+ Qcur = ggml_rope_ext(
13827
+ ctx0, Qcur, inp_pos, nullptr,
13828
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13829
+ ext_factor, attn_factor, beta_fast, beta_slow
13830
+ );
13831
+
13832
+ Kcur = ggml_rope_ext(
13833
+ ctx0, Kcur, inp_pos, nullptr,
13834
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13835
+ ext_factor, attn_factor, beta_fast, beta_slow
13836
+ );
13837
+
13838
+ cb(Qcur, "Qcur", il);
13839
+ cb(Kcur, "Kcur", il);
13840
+ cb(Vcur, "Vcur", il);
13841
+
13842
+ cur = build_attn(inp_attn,
13843
+ model.layers[il].wo, NULL,
13844
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13845
+ }
13846
+
13847
+ if (il == n_transformer_layers - 1 && inp_out_ids) {
13848
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13849
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13850
+ }
13851
+
13852
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
13853
+ cb(ffn_inp, "ffn_inp", il);
13854
+
13855
+ // Post-attention norm
13856
+ cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
13857
+ cb(cur, "post_attn_norm", il);
13858
+
13859
+ // Check if this is a dense layer (n_layer_dense_lead=1, so layer 0 is dense)
13860
+ if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
13861
+ // Dense FFN layer
13862
+ cur = build_ffn(cur,
13863
+ model.layers[il].ffn_up, NULL, NULL,
13864
+ model.layers[il].ffn_gate, NULL, NULL,
13865
+ model.layers[il].ffn_down, NULL, NULL,
13866
+ NULL,
13867
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
13868
+ cb(cur, "ffn_out", il);
13869
+ } else {
13870
+ // Process routed experts using existing MoE infrastructure
13871
+ ggml_tensor * routed_out = build_moe_ffn(cur,
13872
+ model.layers[il].ffn_gate_inp,
13873
+ model.layers[il].ffn_up_exps,
13874
+ model.layers[il].ffn_gate_exps,
13875
+ model.layers[il].ffn_down_exps,
13876
+ model.layers[il].ffn_exp_probs_b,
13877
+ n_expert, n_expert_used,
13878
+ LLM_FFN_SILU, hparams.expert_weights_norm,
13879
+ true, hparams.expert_weights_scale,
13880
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
13881
+ il);
13882
+ cb(routed_out, "ffn_moe_out", il);
13883
+
13884
+ // Process shared expert on original input
13885
+ ggml_tensor * shared_out = build_ffn(cur,
13886
+ model.layers[il].ffn_up_shexp, NULL, NULL,
13887
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
13888
+ model.layers[il].ffn_down_shexp, NULL, NULL,
13889
+ NULL,
13890
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
13891
+ cb(shared_out, "ffn_shexp_out", il);
13892
+
13893
+ // Final output: routed_output + shared_output
13894
+ cur = ggml_add(ctx0, routed_out, shared_out);
13895
+ cb(cur, "ffn_out", il);
13896
+ }
13897
+
13898
+ cur = ggml_add(ctx0, cur, ffn_inp);
13899
+
13900
+ cur = build_cvec(cur, il);
13901
+ cb(cur, "l_out", il);
13902
+
13903
+ // input for next layer
13904
+ inpL = cur;
13905
+ }
13906
+
13907
+ cur = inpL;
13908
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
13909
+
13910
+ cb(cur, "result_norm", -1);
13911
+ res->t_embd = cur;
13912
+
13913
+ // lm_head
13914
+ cur = build_lora_mm(model.output, cur);
13915
+
13916
+ cb(cur, "result_output", -1);
13917
+ res->t_logits = cur;
13918
+
13919
+ ggml_build_forward_expand(gf, cur);
13920
+ }
13921
+ };
13922
+
13923
  struct llm_build_nemotron : public llm_graph_context {
13924
  llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
13925
  const int64_t n_embd_head = hparams.n_embd_head_v;
 
17332
  }
17333
  };
17334
 
17335
+ struct llm_build_hunyuan_dense : public llm_graph_context {
17336
+ llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
17337
  const int64_t n_embd_head = hparams.n_embd_head_v;
17338
 
17339
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
17349
 
17350
  auto * inp_attn = build_attn_inp_kv_unified();
17351
 
17352
+ const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
17353
 
17354
  ggml_tensor * inp_out_ids = build_inp_out_ids();
17355
 
17356
  for (int il = 0; il < n_layer; ++il) {
17357
  ggml_tensor * inpSA = inpL;
17358
 
 
 
17359
  // norm
17360
  cur = build_norm(inpL,
17361
  model.layers[il].attn_norm, NULL,
17362
  LLM_NORM_RMS, il);
17363
  cb(cur, "attn_norm", il);
 
17364
  // self-attention
17365
  {
17366
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
17367
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
17368
+
17369
  // compute Q and K and RoPE them
17370
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
17371
  cb(Qcur, "Qcur", il);
 
17392
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
17393
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
17394
 
17395
+ Qcur = ggml_rope_ext(
17396
+ ctx0, Qcur, inp_pos, rope_factors,
17397
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
17398
+ ext_factor, attn_factor, beta_fast, beta_slow
17399
+ );
17400
+
17401
+ cb(Qcur, "Qcur", il);
17402
+ cb(Kcur, "Kcur", il);
17403
+ cb(Vcur, "Vcur", il);
17404
+
17405
+ Kcur = ggml_rope_ext(
17406
+ ctx0, Kcur, inp_pos, rope_factors,
17407
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
17408
+ ext_factor, attn_factor, beta_fast, beta_slow
17409
+ );
17410
+
17411
+ Kcur = build_norm(Kcur,
17412
+ model.layers[il].attn_k_norm, nullptr,
17413
+ LLM_NORM_RMS, il);
17414
+ cb(Kcur, "Kcur_norm", il);
17415
+
17416
+ Qcur = build_norm(Qcur,
17417
+ model.layers[il].attn_q_norm, nullptr,
17418
+ LLM_NORM_RMS, il);
17419
+ cb(Qcur, "Qcur_norm", il);
17420
+
17421
+ cur = build_attn(inp_attn,
17422
+ model.layers[il].wo, model.layers[il].bo,
17423
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
17424
+ cb(cur, "attn_out", il);
17425
+ }
17426
+
17427
+ if (il == n_layer - 1 && inp_out_ids) {
17428
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
17429
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
17430
+ }
17431
+
17432
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
17433
+ cb(ffn_inp, "ffn_inp", il);
17434
+
17435
+ cur = build_norm(ffn_inp,
17436
+ model.layers[il].ffn_norm, NULL,
17437
+ LLM_NORM_RMS, il);
17438
+ cb(cur, "ffn_norm", il);
17439
+ // feed-forward network (non-MoE)
17440
+ ggml_tensor * cur_mlp = build_ffn(cur,
17441
+ model.layers[il].ffn_up, NULL, NULL,
17442
+ model.layers[il].ffn_gate, NULL, NULL,
17443
+ model.layers[il].ffn_down, NULL, NULL,
17444
+ NULL,
17445
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
17446
+ cb(cur_mlp, "ffn_out", il);
17447
+
17448
+ cur = ggml_add(ctx0, cur_mlp, ffn_inp);
17449
+
17450
+ cur = build_cvec(cur, il);
17451
+ cb(cur, "l_out", il);
17452
+
17453
+ // input for next layer
17454
+ inpL = cur;
17455
+ }
17456
+ cur = inpL;
17457
+
17458
+ cur = build_norm(cur,
17459
+ model.output_norm, NULL,
17460
+ LLM_NORM_RMS, -1);
17461
+
17462
+ cb(cur, "result_norm", -1);
17463
+ res->t_embd = cur;
17464
+ // lm_head
17465
+ cur = build_lora_mm(model.output, cur);
17466
+ cb(cur, "result_output", -1);
17467
+ res->t_logits = cur;
17468
+
17469
+ ggml_build_forward_expand(gf, cur);
17470
+ }
17471
+ };
17472
+
17473
+ struct llm_build_smollm3 : public llm_graph_context {
17474
+ llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
17475
+ const int64_t n_embd_head = hparams.n_embd_head_v;
17476
+
17477
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
17478
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
17479
+
17480
+ ggml_tensor * cur;
17481
+ ggml_tensor * inpL;
17482
+
17483
+ inpL = build_inp_embd(model.tok_embd);
17484
+
17485
+ // inp_pos - contains the positions
17486
+ ggml_tensor * inp_pos = build_inp_pos();
17487
+
17488
+ auto * inp_attn = build_attn_inp_kv_unified();
17489
+
17490
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
17491
+
17492
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
17493
+
17494
+ for (int il = 0; il < n_layer; ++il) {
17495
+ ggml_tensor * inpSA = inpL;
17496
+
17497
+ const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
17498
+
17499
+ // norm
17500
+ cur = build_norm(inpL,
17501
+ model.layers[il].attn_norm, NULL,
17502
+ LLM_NORM_RMS, il);
17503
+ cb(cur, "attn_norm", il);
17504
+
17505
+ // self-attention
17506
+ {
17507
+ // compute Q and K and RoPE them
17508
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
17509
+ cb(Qcur, "Qcur", il);
17510
+ if (model.layers[il].bq) {
17511
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
17512
+ cb(Qcur, "Qcur", il);
17513
+ }
17514
+
17515
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
17516
+ cb(Kcur, "Kcur", il);
17517
+ if (model.layers[il].bk) {
17518
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
17519
+ cb(Kcur, "Kcur", il);
17520
+ }
17521
+
17522
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
17523
+ cb(Vcur, "Vcur", il);
17524
+ if (model.layers[il].bv) {
17525
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
17526
+ cb(Vcur, "Vcur", il);
17527
+ }
17528
+
17529
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
17530
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
17531
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
17532
+
17533
+ if (use_rope) {
17534
+ Qcur = ggml_rope_ext(
17535
+ ctx0, Qcur, inp_pos, nullptr,
17536
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
17537
  ext_factor, attn_factor, beta_fast, beta_slow
17538
  );
17539
 
 
17607
  }
17608
  };
17609
 
17610
+ struct llm_build_openai_moe_iswa : public llm_graph_context {
17611
+ llm_build_openai_moe_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
17612
+ ggml_tensor * cur;
17613
+ ggml_tensor * inpL;
17614
+
17615
+ inpL = build_inp_embd(model.tok_embd);
17616
+
17617
+ // inp_pos - contains the positions
17618
+ ggml_tensor * inp_pos = build_inp_pos();
17619
+
17620
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
17621
+
17622
+ for (int il = 0; il < n_layer; ++il) {
17623
+ ggml_tensor * inpSA = inpL;
17624
+
17625
+ // norm
17626
+ cur = build_norm(inpL,
17627
+ model.layers[il].attn_norm, nullptr,
17628
+ LLM_NORM_RMS, il);
17629
+ cb(cur, "attn_norm", il);
17630
+
17631
+ // self-attention
17632
+ {
17633
+ // compute Q and K and RoPE them
17634
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
17635
+ cb(Qcur, "Qcur", il);
17636
+ if (model.layers[il].bq) {
17637
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
17638
+ cb(Qcur, "Qcur", il);
17639
+ }
17640
+
17641
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
17642
+ cb(Kcur, "Kcur", il);
17643
+ if (model.layers[il].bk) {
17644
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
17645
+ cb(Kcur, "Kcur", il);
17646
+ }
17647
+
17648
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
17649
+ cb(Vcur, "Vcur", il);
17650
+ if (model.layers[il].bv) {
17651
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
17652
+ cb(Vcur, "Vcur", il);
17653
+ }
17654
+
17655
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
17656
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
17657
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
17658
+
17659
+ Qcur = ggml_rope_ext(
17660
+ ctx0, Qcur, inp_pos, nullptr,
17661
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
17662
+ ext_factor, attn_factor, beta_fast, beta_slow
17663
+ );
17664
+
17665
+ Kcur = ggml_rope_ext(
17666
+ ctx0, Kcur, inp_pos, nullptr,
17667
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
17668
+ ext_factor, attn_factor, beta_fast, beta_slow
17669
+ );
17670
+
17671
+ cb(Qcur, "Qcur", il);
17672
+ cb(Kcur, "Kcur", il);
17673
+ cb(Vcur, "Vcur", il);
17674
+
17675
+ cur = build_attn_with_sinks(inp_attn,
17676
+ model.layers[il].wo, model.layers[il].bo,
17677
+ Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].attn_sinks, 1.0f/sqrtf(float(n_rot)), il);
17678
+
17679
+ cb(cur, "attn_out", il);
17680
+ }
17681
+
17682
+ if (il == n_layer - 1) {
17683
+ // skip computing output for unused tokens
17684
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
17685
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
17686
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
17687
+ }
17688
+
17689
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
17690
+ cb(ffn_inp, "ffn_inp", il);
17691
+
17692
+ cur = ffn_inp;
17693
+ cur = build_norm(cur,
17694
+ model.layers[il].attn_post_norm, nullptr,
17695
+ LLM_NORM_RMS, il);
17696
+ cb(cur, "attn_post_norm", il);
17697
+
17698
+ // MoE branch
17699
+ cur = build_moe_ffn(cur,
17700
+ model.layers[il].ffn_gate_inp, model.layers[il].ffn_gate_inp_b,
17701
+ model.layers[il].ffn_up_exps, model.layers[il].ffn_up_exps_b,
17702
+ model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b,
17703
+ model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b,
17704
+ nullptr,
17705
+ n_expert, n_expert_used,
17706
+ LLM_FFN_SWIGLU_OAI_MOE, false,
17707
+ false, 0.0,
17708
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
17709
+ il);
17710
+ cb(cur, "ffn_moe_out", il);
17711
+
17712
+ cur = ggml_add(ctx0, cur, ffn_inp);
17713
+
17714
+ cur = build_cvec(cur, il);
17715
+ cb(cur, "l_out", il);
17716
+
17717
+ // input for next layer
17718
+ inpL = cur;
17719
+ }
17720
+
17721
+ cur = inpL;
17722
+
17723
+ cur = build_norm(cur,
17724
+ model.output_norm, NULL,
17725
+ LLM_NORM_RMS, -1);
17726
+
17727
+ cb(cur, "result_norm", -1);
17728
+ res->t_embd = cur;
17729
+
17730
+ // lm_head
17731
+ cur = build_lora_mm(model.output, cur);
17732
+
17733
+ cb(cur, "result_output", -1);
17734
+ res->t_logits = cur;
17735
+
17736
+ ggml_build_forward_expand(gf, cur);
17737
+ }
17738
+ };
17739
+
17740
  struct llm_build_lfm2 : public llm_graph_context {
17741
  const llama_model & model;
17742
 
 
17914
  }
17915
  };
17916
 
17917
+ template <bool iswa>
17918
+ struct llm_build_smallthinker : public llm_graph_context{
17919
+ llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
17920
+ const int64_t n_embd_head = hparams.n_embd_head_v;
17921
+
17922
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
17923
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
17924
+
17925
+ ggml_tensor * cur;
17926
+ ggml_tensor * inpL;
17927
+
17928
+ inpL = build_inp_embd(model.tok_embd);
17929
+
17930
+ // inp_pos - contains the positions
17931
+ ggml_tensor * inp_pos = build_inp_pos();
17932
+
17933
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
17934
+ inp_attn_type * inp_attn = nullptr;
17935
+
17936
+ if constexpr (iswa) {
17937
+ inp_attn = build_attn_inp_kv_unified_iswa();
17938
+ } else {
17939
+ inp_attn = build_attn_inp_kv_unified();
17940
+ }
17941
+
17942
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
17943
+
17944
+ for (int il = 0; il < n_layer; ++il) {
17945
+ ggml_tensor * inpSA = inpL;
17946
+ ggml_tensor * probs = nullptr;
17947
+
17948
+ probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
17949
+ cb(probs, "ffn_moe_logits", il);
17950
+
17951
+ // norm
17952
+ cur = build_norm(inpL,model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
17953
+ cb(cur, "attn_norm", il);
17954
+
17955
+ // self_attention
17956
+ {
17957
+ // compute Q and K and RoPE them
17958
+ struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
17959
+ cb(Qcur, "Qcur", il);
17960
+
17961
+ struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
17962
+ cb(Kcur, "Kcur", il);
17963
+
17964
+ struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
17965
+ cb(Vcur, "Vcur", il);
17966
+
17967
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
17968
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
17969
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
17970
+
17971
+ if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) {
17972
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
17973
+ ext_factor, attn_factor, beta_fast, beta_slow);
17974
+
17975
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
17976
+ ext_factor, attn_factor, beta_fast, beta_slow);
17977
+ }
17978
+
17979
+ cb(Qcur, "Qcur", il);
17980
+ cb(Kcur, "Kcur", il);
17981
+
17982
+ cur = build_attn(inp_attn,
17983
+ model.layers[il].wo, model.layers[il].bo,
17984
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
17985
+ }
17986
+
17987
+ if (il == n_layer - 1 && inp_out_ids) {
17988
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
17989
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
17990
+ probs = ggml_get_rows(ctx0, probs, inp_out_ids);
17991
+ }
17992
+
17993
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
17994
+ cb(ffn_inp, "ffn_inp", il);
17995
+
17996
+ // MoE branch
17997
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
17998
+ cb(cur, "ffn_norm", il);
17999
+
18000
+ ggml_tensor * ffn_out =
18001
+ build_moe_ffn(cur,
18002
+ nullptr,
18003
+ model.layers[il].ffn_up_exps,
18004
+ model.layers[il].ffn_gate_exps,
18005
+ model.layers[il].ffn_down_exps,
18006
+ nullptr,
18007
+ n_expert, n_expert_used,
18008
+ LLM_FFN_RELU, true,
18009
+ false, 0.0,
18010
+ static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
18011
+ il, probs);
18012
+
18013
+ cb(ffn_out, "ffn_out", il);
18014
+ cur = ffn_out;
18015
+
18016
+ cur = ggml_add(ctx0, cur, ffn_inp);
18017
+ cur = build_cvec(cur, il);
18018
+ cb(cur, "l_out", il);
18019
+
18020
+ // input for next layer
18021
+ inpL = cur;
18022
+ }
18023
+
18024
+ cur = inpL;
18025
+
18026
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
18027
+ cb(cur, "result_norm", -1);
18028
+
18029
+ // lm_head
18030
+ cur = build_lora_mm(model.output, cur);
18031
+ cb(cur, "result_output", -1);
18032
+ res->t_logits = cur;
18033
+
18034
+ ggml_build_forward_expand(gf, cur);
18035
+ }
18036
+ };
18037
+
18038
  llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
18039
  llama_memory_i * res;
18040
 
 
18048
  case LLM_ARCH_NEO_BERT:
18049
  case LLM_ARCH_WAVTOKENIZER_DEC:
18050
  case LLM_ARCH_DREAM:
18051
+ case LLM_ARCH_LLADA:
18052
  {
18053
  res = nullptr;
18054
  } break;
 
18084
  /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
18085
  /* n_seq_max */ cparams.n_seq_max,
18086
  /* offload */ cparams.offload_kqv,
18087
+ /* unified */ cparams.kv_unified,
18088
  /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
18089
  /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
18090
  } else {
 
18216
  llm = std::make_unique<llm_build_dream>(*this, params);
18217
  }
18218
  break;
18219
+ case LLM_ARCH_LLADA:
18220
+ {
18221
+ llm = std::make_unique<llm_build_llada>(*this, params);
18222
+ }
18223
+ break;
18224
  case LLM_ARCH_QWEN2VL:
18225
  {
18226
  llm = std::make_unique<llm_build_qwen2vl>(*this, params);
 
18363
  {
18364
  llm = std::make_unique<llm_build_glm4>(*this, params);
18365
  } break;
18366
+ case LLM_ARCH_GLM4_MOE:
18367
+ {
18368
+ llm = std::make_unique<llm_build_glm4_moe>(*this, params);
18369
+ } break;
18370
  case LLM_ARCH_BITNET:
18371
  {
18372
  llm = std::make_unique<llm_build_bitnet>(*this, params);
 
18472
  {
18473
  llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
18474
  } break;
18475
+ case LLM_ARCH_HUNYUAN_DENSE:
18476
+ {
18477
+ llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
18478
+ } break;
18479
  case LLM_ARCH_SMOLLM3:
18480
  {
18481
  llm = std::make_unique<llm_build_smollm3>(*this, params);
18482
  } break;
18483
+ case LLM_ARCH_OPENAI_MOE:
18484
+ {
18485
+ llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
18486
+ } break;
18487
  case LLM_ARCH_FALCON_H1:
18488
  {
18489
  llm = std::make_unique<llm_build_falcon_h1>(*this, params);
 
18492
  {
18493
  llm = std::make_unique<llm_build_lfm2>(*this, params);
18494
  } break;
18495
+ case LLM_ARCH_SMALLTHINKER:
18496
+ {
18497
+ if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
18498
+ llm = std::make_unique<llm_build_smallthinker<true>> (*this, params);
18499
+ } else {
18500
+ llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
18501
+ }
18502
+ } break;
18503
  default:
18504
  GGML_ABORT("fatal error");
18505
  }
 
18529
  /*.use_mmap =*/ true,
18530
  /*.use_mlock =*/ false,
18531
  /*.check_tensors =*/ false,
18532
+ /*.use_extra_bufts =*/ true,
18533
  };
18534
 
18535
  #ifdef GGML_USE_METAL
 
18632
 
18633
  // use what we call a normal RoPE, operating on pairs of consecutive head values
18634
  case LLM_ARCH_LLAMA:
18635
+ case LLM_ARCH_LLADA:
18636
  case LLM_ARCH_LLAMA4:
18637
  case LLM_ARCH_DECI:
18638
  case LLM_ARCH_BAICHUAN:
 
18699
  case LLM_ARCH_MINICPM3:
18700
  case LLM_ARCH_DOTS1:
18701
  case LLM_ARCH_HUNYUAN_MOE:
18702
+ case LLM_ARCH_OPENAI_MOE:
18703
+ case LLM_ARCH_HUNYUAN_DENSE:
18704
  case LLM_ARCH_LFM2:
18705
+ case LLM_ARCH_SMALLTHINKER:
18706
+ case LLM_ARCH_GLM4_MOE:
18707
  return LLAMA_ROPE_TYPE_NEOX;
18708
 
18709
  case LLM_ARCH_QWEN2VL:
 
18814
  return llm_arch_is_recurrent(model->arch);
18815
  }
18816
 
18817
+ bool llama_model_is_diffusion(const llama_model * model) {
18818
+ return llm_arch_is_diffusion(model->arch);
18819
+ }
18820
+
18821
  const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
18822
  return model->tensors_by_name;
18823
  }
examples/talk-llama/llama-model.h CHANGED
39
  LLM_TYPE_410M,
40
  LLM_TYPE_450M,
41
  LLM_TYPE_475M,
42
+ LLM_TYPE_537M,
43
  LLM_TYPE_700M,
44
  LLM_TYPE_770M,
45
  LLM_TYPE_780M,
 
102
  LLM_TYPE_A13B,
103
  LLM_TYPE_21B_A3B, // Ernie MoE small
104
  LLM_TYPE_30B_A3B,
105
+ LLM_TYPE_106B_A12B, // GLM-4.5-Air
106
  LLM_TYPE_235B_A22B,
107
  LLM_TYPE_300B_A47B, // Ernie MoE big
108
+ LLM_TYPE_355B_A32B, // GLM-4.5
109
  LLM_TYPE_E2B,
110
  LLM_TYPE_E4B,
111
  };
 
169
  struct ggml_tensor * out_proj = nullptr;
170
  };
171
 
172
+ struct llama_layer_nextn {
173
+ struct ggml_tensor * eh_proj = nullptr;
174
+ struct ggml_tensor * embed_tokens = nullptr;
175
+ struct ggml_tensor * enorm = nullptr;
176
+ struct ggml_tensor * hnorm = nullptr;
177
+ struct ggml_tensor * shared_head_head = nullptr;
178
+ struct ggml_tensor * shared_head_norm = nullptr;
179
+ };
180
+
181
  struct llama_layer {
182
  // normalization
183
  struct ggml_tensor * attn_norm = nullptr;
 
253
  struct ggml_tensor * ffn_up_enc = nullptr;
254
 
255
  // ff MoE
256
+ struct ggml_tensor * ffn_gate_inp = nullptr;
257
+ struct ggml_tensor * ffn_gate_exps = nullptr;
258
+ struct ggml_tensor * ffn_down_exps = nullptr;
259
+ struct ggml_tensor * ffn_up_exps = nullptr;
260
+ struct ggml_tensor * ffn_gate_inp_b = nullptr;
261
+ struct ggml_tensor * ffn_gate_exps_b = nullptr;
262
+ struct ggml_tensor * ffn_down_exps_b = nullptr;
263
+ struct ggml_tensor * ffn_up_exps_b = nullptr;
264
 
265
  // ff shared expert (shexp)
266
  struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
 
365
  struct ggml_tensor * laurel_r = nullptr;
366
  struct ggml_tensor * laurel_post_norm = nullptr;
367
 
368
+ // openai-moe
369
+ struct ggml_tensor * attn_sinks = nullptr;
370
+
371
  struct llama_layer_posnet posnet;
372
 
373
  struct llama_layer_convnext convnext;
374
 
375
  struct llama_layer_shortconv shortconv;
376
+
377
+ struct llama_layer_nextn nextn;
378
  };
379
 
380
  struct llama_model {
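
Reviewer note (not part of the diff): the new ffn_*_exps_b tensors and the nextn member default to nullptr, so graph builders can treat a missing tensor as "feature not present". A sketch of that null-check pattern, assuming the internal llama_layer struct above and the existing ggml_mul_mat/ggml_add ops; the actual build code in llama-model.cpp may differ:

    #include "ggml.h"
    #include "llama-model.h" // internal header shown above

    // apply the optional router bias only when the model ships it
    static ggml_tensor * router_logits(ggml_context * ctx, const llama_layer & layer, ggml_tensor * cur) {
        ggml_tensor * logits = ggml_mul_mat(ctx, layer.ffn_gate_inp, cur); // [n_expert, n_tokens]
        if (layer.ffn_gate_inp_b) {
            logits = ggml_add(ctx, logits, layer.ffn_gate_inp_b);          // new optional bias tensor
        }
        return logits;
    }
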
examples/talk-llama/llama-quant.cpp CHANGED
@@ -211,7 +211,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
  const int64_t nx = tensor->ne[0];
  const int64_t qk_k = ggml_blck_size(new_type);

- if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
+ new_type = GGML_TYPE_Q8_0;
+ }
+ else if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
  new_type = GGML_TYPE_Q8_0;
  }
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
@@ -223,6 +226,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
  new_type = GGML_TYPE_Q6_K;
  }
  }
+ } else if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
+ // MoE tensors -> MXFP4
+ // other tensors -> Q8_0
+ if (tensor->ne[2] > 1) {
+ new_type = GGML_TYPE_MXFP4;
+ } else {
+ new_type = GGML_TYPE_Q8_0;
+ }
  } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
  if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
  new_type = qs.params->token_embedding_type;
@@ -533,6 +544,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
  case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;

+ case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: default_type = GGML_TYPE_MXFP4; break;
+
  // K-quants
  case LLAMA_FTYPE_MOSTLY_Q2_K_S:
  case LLAMA_FTYPE_MOSTLY_Q2_K: default_type = GGML_TYPE_Q2_K; break;
@@ -875,9 +888,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::

  // get more optimal quantization type based on the tensor shape, layer, etc.
  if (!params->pure && ggml_is_quantized(default_type)) {
+ int fallback = qs.n_fallback;
  new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
- // unless the user specifies a type
- if (params->tensor_types) {
+ // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
+ if (params->tensor_types && qs.n_fallback - fallback == 0) {
  const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
  const std::string tensor_name(tensor->name);
  for (const auto & [tname, qtype] : tensor_types) {
@@ -890,7 +904,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  }
  }
  }
-
  if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
  new_type = params->token_embedding_type;
  }
@@ -984,6 +997,29 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;

  new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+
+ // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
+ #if 0
+ if (new_type == GGML_TYPE_MXFP4) {
+ auto * x = f32_data_03;
+
+ //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
+ std::vector<float> deq(nrows*n_per_row);
+ const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
+ qtype->to_float(new_data_03, deq.data(), deq.size());
+
+ double err = 0.0f;
+ for (int i = 0; i < (int) deq.size(); ++i) {
+ err += fabsf(deq[i] - x[i]);
+ //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
+ if (deq[i] != x[i]) {
+ LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
+ }
+ }
+ //LLAMA_LOG_INFO("err = %f\n", err);
+ GGML_ASSERT(err == 0.00000);
+ }
+ #endif
  }
  LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
  }
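
Reviewer note (not part of the diff): the new LLAMA_FTYPE_MOSTLY_MXFP4_MOE path quantizes 3D expert tensors (tensor->ne[2] > 1) to MXFP4 and everything else to Q8_0. A usage sketch through the public quantize API; the file names are placeholders:

    #include "llama.h"

    // quantize a GGUF file with the new MoE-oriented scheme; returns 0 on success
    static uint32_t quantize_moe_mxfp4(const char * fname_inp, const char * fname_out) {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype = LLAMA_FTYPE_MOSTLY_MXFP4_MOE; // expert (3D) tensors -> MXFP4, rest -> Q8_0
        return llama_model_quantize(fname_inp, fname_out, &params);
    }
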
examples/talk-llama/llama-vocab.cpp CHANGED
@@ -307,6 +307,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
  };
  break;
  case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
+ case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
  regex_exprs = {
  "\\p{N}{1,3}",
  "[一-龥぀-ゟ゠-ヿ]+",
@@ -1855,7 +1856,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  tokenizer_pre == "gigachat" ||
  tokenizer_pre == "jina-v2-es" ||
  tokenizer_pre == "jina-v2-de" ||
- tokenizer_pre == "a.x-4.0") {
+ tokenizer_pre == "a.x-4.0" ||
+ tokenizer_pre == "mellum") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
  } else if (
  tokenizer_pre == "jina-v1-en" ||
@@ -1964,6 +1966,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  tokenizer_pre == "hunyuan") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
  clean_spaces = false;
+ } else if (
+ tokenizer_pre == "hunyuan-dense") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
+ clean_spaces = false;
  } else if (
  tokenizer_pre == "kimi-k2") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
@@ -2185,6 +2191,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  || t.first == "<|fim▁begin|>" // DeepSeek
  || t.first == "<PRE>"
  || t.first == "▁<PRE>" // CodeLlama
+ || t.first == "<|code_prefix|>" // GLM-4.5
  ) {
  special_fim_pre_id = t.second;
  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2204,6 +2211,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  || t.first == "<|fim▁hole|>" // DeepSeek
  || t.first == "<SUF>"
  || t.first == "▁<SUF>" // CodeLlama
+ || t.first == "<|code_suffix|>" // GLM-4.5
  ) {
  special_fim_suf_id = t.second;
  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2223,6 +2231,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  || t.first == "<|fim▁end|>" // DeepSeek
  || t.first == "<MID>"
  || t.first == "▁<MID>" // CodeLlama
+ || t.first == "<|code_middle|>" // GLM-4.5
  ) {
  special_fim_mid_id = t.second;
  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2305,6 +2314,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  || t.first == "<|eot_id|>"
  || t.first == "<|im_end|>"
  || t.first == "<|end|>"
+ || t.first == "<|return|>" // o200k_harmony
+ || t.first == "<|call|>" // o200k_harmony
  || t.first == "<end_of_turn>"
  || t.first == "<|endoftext|>"
  || t.first == "<|eom_id|>"
@@ -2328,6 +2339,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  }
  }

+ // @ngxson : quick hack for gpt-oss, always render these tokens
+ for (const auto & t : token_to_id) {
+ if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+ }
+ }
+
  // sanity checks
  if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
  special_eog_ids.insert(special_eos_id);
@@ -2343,6 +2361,37 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  special_eog_ids.insert(special_eom_id);
  LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
  }
+
+ // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
+ // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
+ // we remove the "<|end|>" token from the EOG list
+ {
+ bool has_return = false;
+ bool has_call = false;
+ bool has_end = false;
+
+ llama_token end_id = LLAMA_TOKEN_NULL;
+
+ LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
+ for (auto tid : special_eog_ids) {
+ LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
+
+ if (id_to_token[tid].text == "<|return|>") {
+ has_return = true;
+ } else if (id_to_token[tid].text == "<|call|>") {
+ has_call = true;
+ } else if (id_to_token[tid].text == "<|end|>") {
+ has_end = true;
+ end_id = tid;
+ }
+ }
+
+ if (has_return && has_call && has_end) {
+ special_eog_ids.erase(end_id);
+ id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+ LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
+ }
+ }
  }

  // build special tokens cache
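
Reviewer note (not part of the diff): with the o200k_harmony workaround above, a vocabulary that defines both <|return|> and <|call|> keeps those two as end-of-generation tokens while <|end|> is demoted to a user-defined token. A sketch of how client code can observe this, assuming the pre-existing llama_model_get_vocab()/llama_vocab_is_eog() helpers; the token id would come from the application's own lookup:

    #include "llama.h"
    #include <cstdio>

    // report whether a given token still terminates generation
    static void report_eog(const llama_model * model, llama_token tok, const char * name) {
        const llama_vocab * vocab = llama_model_get_vocab(model);
        std::printf("%s %s an end-of-generation token\n", name, llama_vocab_is_eog(vocab, tok) ? "is" : "is not");
    }
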
examples/talk-llama/llama-vocab.h CHANGED
@@ -46,6 +46,7 @@ enum llama_vocab_pre_type {
  LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
  LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
  LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
+ LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
  };

  struct LLM_KV;
examples/talk-llama/llama.h CHANGED
@@ -152,6 +152,7 @@ extern "C" {
  //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
  LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors

  LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
  };
@@ -284,10 +285,11 @@ extern "C" {
  const struct llama_model_kv_override * kv_overrides;

  // Keep the booleans together to avoid misalignment during copy-by-value.
- bool vocab_only; // only load the vocabulary, no weights
- bool use_mmap; // use mmap if possible
- bool use_mlock; // force system to keep model in RAM
- bool check_tensors; // validate model tensor data
+ bool vocab_only; // only load the vocabulary, no weights
+ bool use_mmap; // use mmap if possible
+ bool use_mlock; // force system to keep model in RAM
+ bool check_tensors; // validate model tensor data
+ bool use_extra_bufts; // use extra buffer types (used for weight repacking)
  };

  // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
@@ -537,6 +539,9 @@ extern "C" {
  // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
  LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);

+ // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
+ LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
+
  // Returns 0 on success
  LLAMA_API uint32_t llama_model_quantize(
  const char * fname_inp,
@@ -865,6 +870,29 @@ extern "C" {
  size_t n_token_capacity,
  size_t * n_token_count_out);

+ #define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
+
+ typedef uint32_t llama_state_seq_flags;
+
+ LLAMA_API size_t llama_state_seq_get_size_ext(
+ struct llama_context * ctx,
+ llama_seq_id seq_id,
+ llama_state_seq_flags flags);
+
+ LLAMA_API size_t llama_state_seq_get_data_ext(
+ struct llama_context * ctx,
+ uint8_t * dst,
+ size_t size,
+ llama_seq_id seq_id,
+ llama_state_seq_flags flags);
+
+ LLAMA_API size_t llama_state_seq_set_data_ext(
+ struct llama_context * ctx,
+ const uint8_t * src,
+ size_t size,
+ llama_seq_id dest_seq_id,
+ llama_state_seq_flags flags);
+
  //
  // Decoding
  //
@@ -1432,6 +1460,8 @@ extern "C" {

  ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
  void * get_opt_pars_ud; // userdata for calculating optimizer parameters
+
+ enum ggml_opt_optimizer_type optimizer_type;
  };

  LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
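
Reviewer note (not part of the diff): the llama_state_seq_*_ext functions generalize the existing per-sequence save/restore calls with a flags word; LLAMA_STATE_SEQ_FLAGS_SWA_ONLY presumably limits the payload to the SWA portion of the cache. A usage sketch based only on the declarations added above:

    #include "llama.h"
    #include <vector>

    // snapshot sequence 0 (SWA portion only) and restore it later
    static void snapshot_and_restore_swa(llama_context * ctx) {
        const llama_state_seq_flags flags = LLAMA_STATE_SEQ_FLAGS_SWA_ONLY;

        std::vector<uint8_t> buf(llama_state_seq_get_size_ext(ctx, /*seq_id=*/0, flags));
        llama_state_seq_get_data_ext(ctx, buf.data(), buf.size(), /*seq_id=*/0, flags);

        // ... later: write the snapshot back into the same sequence id
        llama_state_seq_set_data_ext(ctx, buf.data(), buf.size(), /*dest_seq_id=*/0, flags);
    }
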