talk-llama : sync llama.cpp
Files changed:
- examples/talk-llama/llama.cpp (+73 -47)
- examples/talk-llama/llama.h (+16 -14)

examples/talk-llama/llama.cpp
CHANGED
@@ -509,7 +509,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
     {
         { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
         { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
-        { LLM_TENSOR_OUTPUT,      "output" },
         { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
         { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
         { LLM_TENSOR_ATTN_QKV,    "blk.%d.attn_qkv" },
@@ -851,9 +850,9 @@ struct LLM_TN {
 //

 static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
-    { LLAMA_ROPE_SCALING_NONE,   "none"   },
-    { LLAMA_ROPE_SCALING_LINEAR, "linear" },
-    { LLAMA_ROPE_SCALING_YARN,   "yarn"   },
+    { LLAMA_ROPE_SCALING_TYPE_NONE,   "none"   },
+    { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
+    { LLAMA_ROPE_SCALING_TYPE_YARN,   "yarn"   },
 };

 static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
@@ -863,7 +862,7 @@ static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
         }
     }

-    return LLAMA_ROPE_SCALING_UNSPECIFIED;
+    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
 }

 static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
@@ -1581,7 +1580,7 @@ struct llama_hparams {
     bool causal_attn = true;
     bool need_kq_pos = false;

-    uint32_t pooling_type = LLAMA_POOLING_NONE;
+    uint32_t pooling_type = LLAMA_POOLING_TYPE_NONE;

     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
@@ -2346,9 +2345,9 @@ namespace GGUFMeta {

     static const char * override_type_to_str(const llama_model_kv_override_type ty) {
         switch (ty) {
-            case LLAMA_KV_OVERRIDE_BOOL:  return "bool";
-            case LLAMA_KV_OVERRIDE_INT:   return "int";
-            case LLAMA_KV_OVERRIDE_FLOAT: return "float";
+            case LLAMA_KV_OVERRIDE_TYPE_BOOL:  return "bool";
+            case LLAMA_KV_OVERRIDE_TYPE_INT:   return "int";
+            case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
         }
         return "unknown";
     }
@@ -2359,13 +2358,13 @@ namespace GGUFMeta {
                 LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
                     __func__, override_type_to_str(override->tag), override->key);
                 switch (override->tag) {
-                    case LLAMA_KV_OVERRIDE_BOOL:  {
+                    case LLAMA_KV_OVERRIDE_TYPE_BOOL:  {
                         LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false");
                     } break;
-                    case LLAMA_KV_OVERRIDE_INT:   {
+                    case LLAMA_KV_OVERRIDE_TYPE_INT:   {
                         LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value);
                     } break;
-                    case LLAMA_KV_OVERRIDE_FLOAT: {
+                    case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
                         LLAMA_LOG_INFO("%.6f\n", override->float_value);
                     } break;
                     default:
@@ -2384,7 +2383,7 @@ namespace GGUFMeta {
         template<typename OT>
         static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
         try_override(OT & target, const struct llama_model_kv_override *override) {
-            if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
+            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, override)) {
                 target = override->bool_value;
                 return true;
             }
@@ -2394,7 +2393,7 @@ namespace GGUFMeta {
         template<typename OT>
         static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
         try_override(OT & target, const struct llama_model_kv_override *override) {
-            if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
+            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, override)) {
                 target = override->int_value;
                 return true;
             }
@@ -2404,7 +2403,7 @@ namespace GGUFMeta {
         template<typename OT>
         static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
         try_override(T & target, const struct llama_model_kv_override *override) {
-            if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
+            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, override)) {
                 target = override->float_value;
                 return true;
             }
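Note: these overrides reach the loader through the public `kv_overrides` field of `llama_model_params`. A minimal sketch of a consumer, assuming the talk-llama copy of llama.h; the model path and the override key are placeholders, and the empty-key terminator reflects how the loader detects the end of the array:

```cpp
// Sketch: passing a metadata override to the model loader.
#include <cstring>
#include "llama.h"

int main() {
    llama_model_kv_override overrides[2];
    std::memset(overrides, 0, sizeof(overrides));

    overrides[0].tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
    std::strncpy(overrides[0].key, "tokenizer.ggml.add_bos_token", sizeof(overrides[0].key) - 1);
    overrides[0].bool_value = false;
    // overrides[1] stays zeroed -> empty key terminates the list

    llama_model_params mparams = llama_model_default_params();
    mparams.kv_overrides = overrides;

    // (backend init omitted for brevity)
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model) llama_free_model(model);
    return 0;
}
```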
@@ -2546,6 +2545,7 @@ struct llama_model_loader {
                 case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
                 case GGML_TYPE_IQ1_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_S;   break;
                 case GGML_TYPE_IQ4_NL:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;  break;
+                case GGML_TYPE_IQ3_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ3_S;   break;
                 default:
                     {
                         LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2891,6 +2891,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ1_S  :return "IQ1_S - 1.5625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_S:  return "IQ3_S - 3.4375 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:  return "IQ3_S mix - 3.66 bpw";

         default: return "unknown, may not work";
     }
@@ -2997,7 +2999,7 @@ static void llm_load_hparams(
         std::string rope_scaling("linear");
         ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
         hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
-        GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
+        GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);

         // rope_freq_scale (inverse of the kv) is optional
         float ropescale = 0.0f;
@@ -3641,7 +3643,7 @@ static bool llm_load_tensors(
         model.buft_layer[i] = llama_default_buffer_type_cpu(true);
     }

-    if (split_mode == LLAMA_SPLIT_LAYER) {
+    if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
         // calculate the split points
         int device_count = llama_get_device_count();
         bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
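Note: under LLAMA_SPLIT_MODE_LAYER, whole layers are distributed across devices in proportion to the `tensor_split` fractions. A standalone sketch of that bookkeeping, not the exact upstream code; the device fractions and layer count are made up:

```cpp
// Sketch: normalize per-device fractions into prefix sums, then map each
// layer's position in the stack to the device whose cumulative share covers it.
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    std::vector<float> tensor_split = {0.5f, 0.25f, 0.25f}; // hypothetical 3 GPUs
    const int n_layer = 32;

    std::vector<float> splits(tensor_split.size());
    std::partial_sum(tensor_split.begin(), tensor_split.end(), splits.begin());
    const float total = splits.back();
    for (float & s : splits) s /= total; // prefix sums normalized to 1.0

    for (int il = 0; il < n_layer; ++il) {
        const float frac   = float(il) / n_layer;
        const int   device = int(std::upper_bound(splits.begin(), splits.end(), frac) - splits.begin());
        if (il % 8 == 0) std::printf("layer %2d -> device %d\n", il, device);
    }
    return 0;
}
```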
@@ -3680,10 +3682,10 @@ static bool llm_load_tensors(
             }
         } else {
             ggml_backend_buffer_type_t split_buft;
-            if (split_mode == LLAMA_SPLIT_ROW) {
+            if (split_mode == LLAMA_SPLIT_MODE_ROW) {
                 split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
             } else {
-                // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported
+                // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
                 split_buft = llama_default_buffer_type_offload(main_gpu);
             }
             // assign the repeating layers
@@ -4056,7 +4058,10 @@ static bool llm_load_tensors(
                     model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, false);

-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                    // same as tok_embd, duplicated to allow offloading
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    ml.n_created--; // artificial tensor
+                    ml.size_data += ggml_nbytes(model.output);
                 }

                 for (int i = 0; i < n_layer; ++i) {
@@ -5065,7 +5070,7 @@ struct llm_build_context {
         kv_head          (worst_case ? n_ctx - n_tokens : kv_self.head),
         n_orig_ctx       (cparams.n_yarn_orig_ctx),
         do_rope_shift    (worst_case || kv_self.has_shift),
-        pooling_type     (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
+        pooling_type     (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_TYPE_NONE),
         cb               (cb),
         buf_compute_meta (lctx.buf_compute_meta) {
             // all initializations should be done in init()
@@ -6045,12 +6050,12 @@ struct llm_build_context {
         cur = inpL;

         // pooling layer
-        if (pooling_type == LLAMA_POOLING_MEAN) {
+        if (pooling_type == LLAMA_POOLING_TYPE_MEAN) {
             cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
-        } else if (pooling_type == LLAMA_POOLING_CLS) {
+        } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) {
             cur = ggml_get_rows(ctx0, cur, inp_cls);
         } else {
-            GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
+            GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type");
         }
         cb(cur, "result_embd", -1);
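Note: mean pooling is expressed here as a matrix product with `inp_mean`, whose rows hold per-sequence averaging weights (each entry 1/n for the n tokens of a sequence, 0 elsewhere). A plain-array sketch of the same semantics, independent of ggml:

```cpp
// Sketch: multiplying token embeddings by averaging weights yields the
// per-sequence mean embedding, which is what the MEAN pooling branch computes.
#include <cstdio>
#include <vector>

int main() {
    const int n_tokens = 3, n_embd = 4;
    // embeddings: n_tokens rows of n_embd values
    std::vector<float> embd = {
        1, 2,  3,  4,
        5, 6,  7,  8,
        9, 10, 11, 12,
    };
    // one sequence covering all tokens -> each weight is 1/n_tokens
    std::vector<float> mean_w(n_tokens, 1.0f / n_tokens);

    std::vector<float> pooled(n_embd, 0.0f);
    for (int t = 0; t < n_tokens; ++t) {
        for (int e = 0; e < n_embd; ++e) {
            pooled[e] += mean_w[t] * embd[t*n_embd + e];
        }
    }
    for (int e = 0; e < n_embd; ++e) std::printf("%g ", pooled[e]); // prints: 5 6 7 8
    std::printf("\n");
    return 0;
}
```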
@@ -7749,7 +7754,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }

-    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
+    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
         const int64_t n_tokens = batch.n_tokens;

         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
@@ -7777,7 +7782,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }

-    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
+    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
         const int64_t n_tokens = batch.n_tokens;

         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
@@ -10542,6 +10547,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
             new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
@@ -10573,13 +10584,17 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             new_type = GGML_TYPE_Q8_0;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
-            new_type = …
+            new_type = GGML_TYPE_IQ3_XXS;
+        }
+    } else if (name.find("attn_q.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
+            new_type = GGML_TYPE_IQ3_XXS;
         }
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
@@ -10590,6 +10605,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
                      : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
+                    (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
+            new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
             new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
         }
@@ -10621,37 +10640,41 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             if (qs.model.hparams.n_expert == 8) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
-                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
+                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
                     new_type = GGML_TYPE_Q5_K;
                 }
             } else {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+                if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   ) new_type = GGML_TYPE_Q3_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_Q4_K;
             }
         } else {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
         }
     }
     else if (name.find("attn_qkv.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     }
     else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && …
-            new_type = …
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+            new_type = GGML_TYPE_IQ3_XXS;
         }
         ++qs.i_ffn_gate;
     }
     else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && …
-            new_type = …
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+            new_type = GGML_TYPE_IQ3_XXS;
        }
         ++qs.i_ffn_up;
     }
@@ -10671,7 +10694,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
         new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
         new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
-        new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
+        new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
@@ -10686,6 +10709,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             case GGML_TYPE_IQ2_XXS:
             case GGML_TYPE_IQ2_XS:
             case GGML_TYPE_IQ3_XXS:
+            case GGML_TYPE_IQ3_S:
             case GGML_TYPE_IQ1_S:
             case GGML_TYPE_Q2_K:
             case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break;
@@ -10717,7 +10741,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K_S:
         case LLAMA_FTYPE_MOSTLY_Q2_K:    quantized_type = GGML_TYPE_Q2_K;    break;
-        case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
+        case LLAMA_FTYPE_MOSTLY_Q3_K_XS: quantized_type = GGML_TYPE_IQ3_S;   break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
         case LLAMA_FTYPE_MOSTLY_Q3_K_L:  quantized_type = GGML_TYPE_Q3_K;    break;
@@ -10731,6 +10755,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
         case LLAMA_FTYPE_MOSTLY_IQ1_S:   quantized_type = GGML_TYPE_IQ1_S;   break;
         case LLAMA_FTYPE_MOSTLY_IQ4_NL:  quantized_type = GGML_TYPE_IQ4_NL;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_S:   quantized_type = GGML_TYPE_IQ3_S;   break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:   quantized_type = GGML_TYPE_IQ3_S;   break;

         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
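Note: both new file types are requested through the existing public quantization entry point; IQ3_M differs from IQ3_S only in the per-tensor mix chosen above. A minimal sketch, assuming the talk-llama copy of llama.h; input and output paths are placeholders:

```cpp
// Sketch: quantizing a model to one of the new IQ3 file types.
#include "llama.h"

int main() {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_IQ3_M; // both IQ3_S and IQ3_M emit GGML_TYPE_IQ3_S tensors

    // returns 0 on success
    return llama_model_quantize("model-f16.gguf", "model-iq3_m.gguf", &qparams) == 0 ? 0 : 1;
}
```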
@@ -11325,7 +11351,7 @@ static int llama_apply_lora_from_file_internal(
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
         /*.n_gpu_layers                =*/ 0,
-        /*.split_mode                  =*/ LLAMA_SPLIT_LAYER,
+        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
         /*.progress_callback           =*/ nullptr,
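Note: the renamed split-mode constants are what API consumers now pass through `llama_model_params`. A minimal sketch; the model path and layer count are placeholders:

```cpp
// Sketch: selecting a GPU split strategy with the renamed enum values.
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;                   // offload everything that fits
    mparams.split_mode   = LLAMA_SPLIT_MODE_ROW; // split matrix rows across GPUs
    mparams.main_gpu     = 0;                    // small tensors and intermediate results go here

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model) llama_free_model(model);
    return 0;
}
```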
@@ -11351,7 +11377,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_batch                     =*/ 512,
         /*.n_threads                   =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
         /*.n_threads_batch             =*/ GGML_DEFAULT_N_THREADS,
-        /*.rope_scaling_type           =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
+        /*.rope_scaling_type           =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
         /*.rope_freq_base              =*/ 0.0f,
         /*.rope_freq_scale             =*/ 0.0f,
         /*.yarn_ext_factor             =*/ -1.0f,
@@ -11539,16 +11565,16 @@ struct llama_context * llama_new_context_with_model(
     cparams.cb_eval_user_data = params.cb_eval_user_data;

     auto rope_scaling_type = params.rope_scaling_type;
-    if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
+    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
         rope_scaling_type = hparams.rope_scaling_type_train;
     }

-    if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) {
+    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
         cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
     }

     if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
-        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
+        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
     }

     if (params.seed == LLAMA_DEFAULT_SEED) {
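Note: the resolution order above (context-level setting falls back to the model's training-time value, and `yarn_ext_factor` defaults to 1.0 only when YaRN is the effective type) can be restated as a standalone helper. A sketch that mirrors the logic, not the upstream function:

```cpp
// Sketch: resolving the effective rope scaling type and YaRN extrapolation factor.
#include <cstdint>
#include <cstdio>
#include "llama.h"

static float resolve_yarn_ext_factor(int32_t requested, int32_t trained, float ext_factor) {
    const int32_t effective = requested == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED ? trained : requested;
    if (ext_factor < 0.0f) { // negative indicates 'not set'
        ext_factor = effective == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
    }
    return ext_factor;
}

int main() {
    // model trained with YaRN, caller leaves everything unset -> 1
    std::printf("%g\n", resolve_yarn_ext_factor(LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
                                                LLAMA_ROPE_SCALING_TYPE_YARN, -1.0f));
    // caller forces linear scaling -> 0
    std::printf("%g\n", resolve_yarn_ext_factor(LLAMA_ROPE_SCALING_TYPE_LINEAR,
                                                LLAMA_ROPE_SCALING_TYPE_YARN, -1.0f));
    return 0;
}
```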
@@ -11582,8 +11608,8 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_CUBLAS)
         if (model->n_gpu_layers > 0) {
-            // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
-            if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) {
+            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+            if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
                 ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
                 if (backend == nullptr) {
                     LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
@@ -11592,7 +11618,7 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(backend);
         } else {
-            // LLAMA_SPLIT_LAYER requires a backend for each GPU
+            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
             for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
                 ggml_backend_t backend = ggml_backend_cuda_init(device);
                 if (backend == nullptr) {
examples/talk-llama/llama.h
CHANGED
@@ -102,28 +102,30 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ3_XXS       = 23, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_S         = 24, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_NL        = 25, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ3_S         = 26, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ3_M         = 27, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };

     enum llama_rope_scaling_type {
-        LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
-        LLAMA_ROPE_SCALING_NONE        = 0,
-        LLAMA_ROPE_SCALING_LINEAR      = 1,
-        LLAMA_ROPE_SCALING_YARN        = 2,
-        LLAMA_ROPE_SCALING_MAX_VALUE   = LLAMA_ROPE_SCALING_YARN,
+        LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
+        LLAMA_ROPE_SCALING_TYPE_NONE        = 0,
+        LLAMA_ROPE_SCALING_TYPE_LINEAR      = 1,
+        LLAMA_ROPE_SCALING_TYPE_YARN        = 2,
+        LLAMA_ROPE_SCALING_TYPE_MAX_VALUE   = LLAMA_ROPE_SCALING_TYPE_YARN,
     };

     enum llama_pooling_type {
-        LLAMA_POOLING_NONE = 0,
-        LLAMA_POOLING_MEAN = 1,
-        LLAMA_POOLING_CLS  = 2,
+        LLAMA_POOLING_TYPE_NONE = 0,
+        LLAMA_POOLING_TYPE_MEAN = 1,
+        LLAMA_POOLING_TYPE_CLS  = 2,
     };

     enum llama_split_mode {
-        LLAMA_SPLIT_NONE  = 0, // single GPU
-        LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
-        LLAMA_SPLIT_ROW   = 2, // split rows across GPUs
+        LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
+        LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
+        LLAMA_SPLIT_MODE_ROW   = 2, // split rows across GPUs
     };

     typedef struct llama_token_data {

@@ -171,9 +173,9 @@ extern "C" {
     } llama_batch;

     enum llama_model_kv_override_type {
-        LLAMA_KV_OVERRIDE_INT,
-        LLAMA_KV_OVERRIDE_FLOAT,
-        LLAMA_KV_OVERRIDE_BOOL,
+        LLAMA_KV_OVERRIDE_TYPE_INT,
+        LLAMA_KV_OVERRIDE_TYPE_FLOAT,
+        LLAMA_KV_OVERRIDE_TYPE_BOOL,
     };

     struct llama_model_kv_override {
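Note: taken together, the enum renames only change spelling at call sites. A minimal sketch of a consumer after this sync, assuming the talk-llama copy of llama.h; the model path is a placeholder and eval/decode details are elided:

```cpp
// Sketch: the renamed enums as they appear to API consumers.
#include "llama.h"

int main() {
    llama_context_params cparams = llama_context_default_params();
    cparams.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; // was LLAMA_ROPE_SCALING_YARN

    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode = LLAMA_SPLIT_MODE_NONE;               // was LLAMA_SPLIT_NONE

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (!model) return 1;
    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx) llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```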