diff --git "a/examples/talk-llama/llama.cpp" "b/examples/talk-llama/llama.cpp" --- "a/examples/talk-llama/llama.cpp" +++ "b/examples/talk-llama/llama.cpp" @@ -1,5 +1,8 @@ +#define LLAMA_API_INTERNAL #include "llama.h" +#include "unicode.h" + #include "ggml.h" #include "ggml-alloc.h" @@ -16,13 +19,11 @@ #ifdef GGML_USE_MPI # include "ggml-mpi.h" #endif -#ifdef GGML_USE_K_QUANTS -# ifndef QK_K -# ifdef GGML_QKK_64 -# define QK_K 64 -# else -# define QK_K 256 -# endif +#ifndef QK_K +# ifdef GGML_QKK_64 +# define QK_K 64 +# else +# define QK_K 256 # endif #endif @@ -53,13 +54,16 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include +#include #include #include #include @@ -68,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -91,12 +96,12 @@ // LLAMA_ATTRIBUTE_FORMAT(2, 3) -static void llama_log_internal (llama_log_level level, const char* format, ...); -static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data); +static void llama_log_internal (ggml_log_level level, const char* format, ...); +static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data); -#define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__) -#define LLAMA_LOG_WARN(...) llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__) -#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__) +#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__) +#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__) +#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) // // helpers @@ -108,7 +113,7 @@ static size_t utf8_len(char src) { return lookup[highbits]; } -void replace_all(std::string & s, const std::string & search, const std::string & replace) { +static void replace_all(std::string & s, const std::string & search, const std::string & replace) { std::string result; for (size_t pos = 0; ; pos += search.length()) { auto new_pos = s.find(search, pos); @@ -121,6 +126,27 @@ void replace_all(std::string & s, const std::string & search, const std::string } s = std::move(result); } + +static bool is_float_close(float a, float b, float abs_tol) { + // Check for non-negative tolerance + if (abs_tol < 0.0) { + throw std::invalid_argument("Tolerance must be non-negative"); + } + + // Exact equality check + if (a == b) { + return true; + } + + // Check for infinities + if (std::isinf(a) || std::isinf(b)) { + return false; + } + + // Regular comparison using the provided absolute tolerance + return std::fabs(b - a) <= abs_tol; +} + #ifdef GGML_USE_CPU_HBM #include #endif @@ -160,17 +186,25 @@ enum llm_arch { LLM_ARCH_GPTJ, LLM_ARCH_GPTNEOX, LLM_ARCH_MPT, + LLM_ARCH_STARCODER, + LLM_ARCH_PERSIMMON, + LLM_ARCH_REFACT, + LLM_ARCH_BLOOM, LLM_ARCH_UNKNOWN, }; static std::map LLM_ARCH_NAMES = { - { LLM_ARCH_LLAMA, "llama" }, - { LLM_ARCH_FALCON, "falcon" }, - { LLM_ARCH_GPT2, "gpt2" }, - { LLM_ARCH_GPTJ, "gptj" }, - { LLM_ARCH_GPTNEOX, "gptneox" }, - { LLM_ARCH_MPT, "mpt" }, - { LLM_ARCH_BAICHUAN,"baichuan" }, + { LLM_ARCH_LLAMA, "llama" }, + { LLM_ARCH_FALCON, "falcon" }, + { LLM_ARCH_GPT2, "gpt2" }, + { LLM_ARCH_GPTJ, "gptj" }, + { LLM_ARCH_GPTNEOX, "gptneox" }, + { LLM_ARCH_MPT, "mpt" }, + { LLM_ARCH_BAICHUAN, "baichuan" }, + { LLM_ARCH_STARCODER, "starcoder" }, + { LLM_ARCH_PERSIMMON, "persimmon" }, + { LLM_ARCH_REFACT, "refact" }, + { LLM_ARCH_BLOOM, "bloom" }, }; enum llm_kv { @@ -202,6 +236,10 @@ enum llm_kv { LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_FREQ_BASE, LLM_KV_ROPE_SCALE_LINEAR, + LLM_KV_ROPE_SCALING_TYPE, + LLM_KV_ROPE_SCALING_FACTOR, + LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, + LLM_KV_ROPE_SCALING_FINETUNED, LLM_KV_TOKENIZER_MODEL, LLM_KV_TOKENIZER_LIST, @@ -218,16 +256,16 @@ enum llm_kv { }; static std::map LLM_KV_NAMES = { - { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, - { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" }, - { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" }, - { LLM_KV_GENERAL_NAME, "general.name" }, - { LLM_KV_GENERAL_AUTHOR, "general.author" }, - { LLM_KV_GENERAL_URL, "general.url" }, - { LLM_KV_GENERAL_DESCRIPTION, "general.description" }, - { LLM_KV_GENERAL_LICENSE, "general.license" }, - { LLM_KV_GENERAL_SOURCE_URL, "general.source_url" }, - { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source_hf_repo" }, + { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, + { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" }, + { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" }, + { LLM_KV_GENERAL_NAME, "general.name" }, + { LLM_KV_GENERAL_AUTHOR, "general.author" }, + { LLM_KV_GENERAL_URL, "general.url" }, + { LLM_KV_GENERAL_DESCRIPTION, "general.description" }, + { LLM_KV_GENERAL_LICENSE, "general.license" }, + { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" }, + { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" }, { LLM_KV_CONTEXT_LENGTH, "%s.context_length" }, { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" }, @@ -243,9 +281,13 @@ static std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, - { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, - { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, - { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, + { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, + { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, + { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, + { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, + { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, + { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, + { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, @@ -273,6 +315,7 @@ struct LLM_KV { enum llm_tensor { LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_TOKEN_EMBD_NORM, LLM_TENSOR_POS_EMBD, LLM_TENSOR_OUTPUT, LLM_TENSOR_OUTPUT_NORM, @@ -289,6 +332,8 @@ enum llm_tensor { LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_UP, LLM_TENSOR_FFN_NORM, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K_NORM, }; static std::map> LLM_TENSOR_NAMES = { @@ -370,10 +415,82 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_PERSIMMON, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd"}, + { LLM_TENSOR_OUTPUT_NORM, "output_norm"}, + { LLM_TENSOR_OUTPUT, "output"}, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"}, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"}, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"}, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"}, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"}, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"}, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"}, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"}, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"}, + }, + }, { LLM_ARCH_MPT, { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, + { + LLM_ARCH_STARCODER, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_POS_EMBD, "position_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + }, + }, + { + LLM_ARCH_REFACT, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, + { + LLM_ARCH_BLOOM, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, }, }, { @@ -430,7 +547,7 @@ struct LLM_TN { // #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ -{ \ +do { \ const std::string skey(key); \ const int kid = gguf_find_key(ctx, skey.c_str()); \ if (kid >= 0) { \ @@ -442,6 +559,22 @@ struct LLM_TN { } else if (req) { \ throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \ } \ +} while (0) + +static std::map LLAMA_ROPE_SCALING_TYPES = { + { LLAMA_ROPE_SCALING_NONE, "none" }, + { LLAMA_ROPE_SCALING_LINEAR, "linear" }, + { LLAMA_ROPE_SCALING_YARN, "yarn" }, +}; + +static int8_t llama_rope_scaling_type_from_string(const std::string & name) { + for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) { + if (kv.second == name) { + return kv.first; + } + } + + return LLAMA_ROPE_SCALING_UNSPECIFIED; } // @@ -862,18 +995,19 @@ struct llama_mlock { typedef void (*offload_func_t)(struct ggml_tensor * tensor); -static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default +static void ggml_offload_nop(struct ggml_tensor * tensor) { (void) tensor; } -static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) { +static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); - const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size()); + const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_token_to_piece(ctx, token, result.data(), result.size()); + int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); GGML_ASSERT(check == -n_tokens); - } else { + } + else { result.resize(n_tokens); } @@ -886,7 +1020,7 @@ static std::string llama_token_to_str(const struct llama_context * ctx, llama_to struct llama_state { // We save the log callback globally - llama_log_callback log_callback = llama_log_callback_default; + ggml_log_callback log_callback = llama_log_callback_default; void * log_callback_user_data = nullptr; }; @@ -895,9 +1029,12 @@ static llama_state g_state; // available llama models enum e_model { MODEL_UNKNOWN, + MODEL_1B, MODEL_3B, MODEL_7B, + MODEL_8B, MODEL_13B, + MODEL_15B, MODEL_30B, MODEL_34B, MODEL_40B, @@ -906,28 +1043,53 @@ enum e_model { }; static const size_t kB = 1024; -static const size_t MB = kB*kB; +static const size_t MB = 1024*kB; +static const size_t GB = 1024*MB; -// default hparams (LLaMA 7B) struct llama_hparams { - uint32_t n_vocab = 32000; - uint32_t n_ctx_train = 2048; // the context size used during training - uint32_t n_ctx = 512; // the context size used during inference - uint32_t n_embd = 4096; - uint32_t n_head = 32; - uint32_t n_head_kv = 32; - uint32_t n_layer = 32; - uint32_t n_rot = 64; - uint32_t n_ff = 11008; - - float f_norm_eps = 1e-5; - float f_norm_rms_eps = 1e-5; - - float rope_freq_base = 10000.0f; - float rope_freq_scale = 1.0f; + bool vocab_only; + uint32_t n_vocab; + uint32_t n_ctx_train; // context size the model was trained on + uint32_t n_embd; + uint32_t n_head; + uint32_t n_head_kv; + uint32_t n_layer; + uint32_t n_rot; + uint32_t n_ff; + + float f_norm_eps; + float f_norm_rms_eps; + + float rope_freq_base_train; + float rope_freq_scale_train; + uint32_t n_yarn_orig_ctx; + int8_t rope_scaling_type_train : 3; + bool rope_finetuned : 1; + + float f_clamp_kqv; + float f_max_alibi_bias; bool operator!=(const llama_hparams & other) const { - return static_cast(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT + if (this->vocab_only != other.vocab_only) return true; + if (this->n_vocab != other.n_vocab) return true; + if (this->n_ctx_train != other.n_ctx_train) return true; + if (this->n_embd != other.n_embd) return true; + if (this->n_head != other.n_head) return true; + if (this->n_head_kv != other.n_head_kv) return true; + if (this->n_layer != other.n_layer) return true; + if (this->n_rot != other.n_rot) return true; + if (this->n_ff != other.n_ff) return true; + if (this->rope_finetuned != other.rope_finetuned) return true; + if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true; + + const float EPSILON = 1e-9; + + if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; + if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; + if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; + if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; + + return false; } uint32_t n_gqa() const { @@ -941,15 +1103,26 @@ struct llama_hparams { uint32_t n_embd_gqa() const { return n_embd/n_gqa(); } +}; - size_t kv_size() const { - size_t result = 2ull; - result *= (size_t) n_embd_gqa(); - result *= (size_t) n_ctx; - result *= (size_t) n_layer; - result *= sizeof(ggml_fp16_t); - return result; - } +struct llama_cparams { + uint32_t n_ctx; // context size used during inference + uint32_t n_batch; + uint32_t n_threads; // number of threads to use for generation + uint32_t n_threads_batch; // number of threads to use for batch processing + + float rope_freq_base; + float rope_freq_scale; + + uint32_t n_yarn_orig_ctx; + // These hyperparameters are not exposed in GGUF, because all + // existing YaRN models use the same values for them. + float yarn_ext_factor; + float yarn_attn_factor; + float yarn_beta_fast; + float yarn_beta_slow; + + bool mul_mat_q; }; struct llama_layer { @@ -958,6 +1131,10 @@ struct llama_layer { struct ggml_tensor * attn_norm_b; struct ggml_tensor * attn_norm_2; struct ggml_tensor * attn_norm_2_b; + struct ggml_tensor * attn_q_norm; + struct ggml_tensor * attn_q_norm_b; + struct ggml_tensor * attn_k_norm; + struct ggml_tensor * attn_k_norm_b; // attention struct ggml_tensor * wq; @@ -966,16 +1143,50 @@ struct llama_layer { struct ggml_tensor * wo; struct ggml_tensor * wqkv; + // attention bias + struct ggml_tensor * bo; + struct ggml_tensor * bqkv; + // normalization struct ggml_tensor * ffn_norm; + struct ggml_tensor * ffn_norm_b; // ff - struct ggml_tensor * w1; // ffn_gate - struct ggml_tensor * w2; // ffn_down - struct ggml_tensor * w3; // ffn_up + struct ggml_tensor * ffn_gate; // w1 + struct ggml_tensor * ffn_down; // w2 + struct ggml_tensor * ffn_up; // w3 + + // ff bias + struct ggml_tensor * ffn_down_b; // b2 + struct ggml_tensor * ffn_up_b; // b3 +}; + +struct llama_kv_cell { + llama_pos pos = -1; + llama_pos delta = 0; + + std::set seq_id; + + bool has_seq_id(const llama_seq_id & id) const { + return seq_id.find(id) != seq_id.end(); + } }; +// ring-buffer of cached KV data struct llama_kv_cache { + bool has_shift = false; + + // Note: The value of head isn't only used to optimize searching + // for a free KV slot. llama_decode_internal also uses it, so it + // cannot be freely changed after a slot has been allocated. + uint32_t head = 0; + uint32_t size = 0; + + // computed before each graph build + uint32_t n = 0; + + std::vector cells; + struct ggml_tensor * k = NULL; struct ggml_tensor * v = NULL; @@ -983,8 +1194,6 @@ struct llama_kv_cache { llama_buffer buf; - int n; // number of tokens currently in the cache - ~llama_kv_cache() { if (ctx) { ggml_free(ctx); @@ -1013,6 +1222,8 @@ struct llama_vocab { std::unordered_map token_to_id; std::vector id_to_token; + std::unordered_map special_tokens_cache; + std::map, int> bpe_ranks; // default LLaMA special tokens @@ -1022,13 +1233,17 @@ struct llama_vocab { id special_sep_id = -1; id special_pad_id = -1; - id linefeed_id = 13; + id linefeed_id = 13; + id special_prefix_id = 32007; + id special_middle_id = 32009; + id special_suffix_id = 32008; + id special_eot_id = 32010; int find_bpe_rank(std::string token_left, std::string token_right) const { - replace_all(token_left, " ", "\u0120"); - replace_all(token_left, "\n", "\u010A"); - replace_all(token_right, " ", "\u0120"); - replace_all(token_right, "\n", "\u010A"); + GGML_ASSERT(token_left.find(" ") == std::string::npos); + GGML_ASSERT(token_left.find("\n") == std::string::npos); + GGML_ASSERT(token_right.find(" ") == std::string::npos); + GGML_ASSERT(token_right.find("\n") == std::string::npos); auto it = bpe_ranks.find(std::make_pair(token_left, token_right)); if (it == bpe_ranks.end()) { @@ -1046,10 +1261,13 @@ struct llama_model { std::string name = "n/a"; - llama_hparams hparams; + llama_hparams hparams = {}; llama_vocab vocab; - struct ggml_tensor * tok_embeddings; + struct ggml_tensor * tok_embd; + struct ggml_tensor * pos_embd; + struct ggml_tensor * tok_norm; + struct ggml_tensor * tok_norm_b; struct ggml_tensor * output_norm; struct ggml_tensor * output_norm_b; @@ -1097,11 +1315,8 @@ struct llama_model { }; struct llama_context { - llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {} + llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {} ~llama_context() { - if (model_owner) { - delete &model; - } #ifdef GGML_USE_METAL if (ctx_metal) { ggml_metal_free(ctx_metal); @@ -1112,27 +1327,26 @@ struct llama_context { } } + llama_cparams cparams; + + const llama_model & model; + + // key + value cache for the self attention + struct llama_kv_cache kv_self; + std::mt19937 rng; bool has_evaluated_once = false; + int64_t t_start_us; + int64_t t_load_us; int64_t t_sample_us = 0; - int64_t t_eval_us = 0; int64_t t_p_eval_us = 0; + int64_t t_eval_us = 0; int32_t n_sample = 0; // number of tokens sampled - int32_t n_eval = 0; // number of eval calls int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) - - const llama_model & model; - - bool model_owner = false; - - int64_t t_load_us; - int64_t t_start_us; - - // key + value cache for the self attention - struct llama_kv_cache kv_self; + int32_t n_eval = 0; // number of eval calls // decode output (2-dimensional array: [n_tokens][n_vocab]) std::vector logits; @@ -1167,16 +1381,24 @@ static bool llama_kv_cache_init( const struct llama_hparams & hparams, struct llama_kv_cache & cache, ggml_type wtype, - int n_ctx, + uint32_t n_ctx, int n_gpu_layers) { - const int n_embd = hparams.n_embd_gqa(); - const int n_layer = hparams.n_layer; + const uint32_t n_embd = hparams.n_embd_gqa(); + const uint32_t n_layer = hparams.n_layer; const int64_t n_mem = n_layer*n_ctx; const int64_t n_elements = n_embd*n_mem; - cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); - cache.n = 0; + cache.has_shift = false; + + cache.head = 0; + cache.size = n_ctx; + + cache.cells.clear(); + cache.cells.resize(n_ctx); + + cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead()); + memset(cache.buf.data, 0, cache.buf.size); struct ggml_init_params params; params.mem_size = cache.buf.size; @@ -1197,17 +1419,196 @@ static bool llama_kv_cache_init( (void) n_gpu_layers; #ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer + 1) { + size_t vram_kv_cache = 0; + + if (n_gpu_layers > (int)n_layer + 1) { ggml_cuda_assign_buffers_no_scratch(cache.v); + LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__); + vram_kv_cache += ggml_nbytes(cache.v); } - if (n_gpu_layers > n_layer + 2) { + if (n_gpu_layers > (int)n_layer + 2) { ggml_cuda_assign_buffers_no_scratch(cache.k); + LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__); + vram_kv_cache += ggml_nbytes(cache.k); + } + if (vram_kv_cache > 0) { + LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); } #endif // GGML_USE_CUBLAS return true; } +// find an empty slot of size "n_tokens" in the cache +// updates the cache head +// Note: On success, it's important that cache.head points +// to the first cell of the slot. +static bool llama_kv_cache_find_slot( + struct llama_kv_cache & cache, + const struct llama_batch & batch) { + const uint32_t n_ctx = cache.size; + const uint32_t n_tokens = batch.n_tokens; + + if (n_tokens > n_ctx) { + LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx); + return false; + } + + uint32_t n_tested = 0; + + while (true) { + if (cache.head + n_tokens > n_ctx) { + n_tested += n_ctx - cache.head; + cache.head = 0; + continue; + } + + bool found = true; + for (uint32_t i = 0; i < n_tokens; i++) { + if (cache.cells[cache.head + i].pos >= 0) { + found = false; + cache.head += i + 1; + n_tested += i + 1; + break; + } + } + + if (found) { + break; + } + + if (n_tested >= n_ctx) { + //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); + return false; + } + } + + for (uint32_t i = 0; i < n_tokens; i++) { + cache.cells[cache.head + i].pos = batch.pos[i]; + + for (int32_t j = 0; j < batch.n_seq_id[i]; j++) { + cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i][j]); + } + } + + return true; +} + +// find how many cells are currently in use +static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { + for (uint32_t i = cache.size - 1; i > 0; --i) { + if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) { + return i + 1; + } + } + + return 0; +} + +static void llama_kv_cache_clear(struct llama_kv_cache & cache) { + for (int32_t i = 0; i < (int32_t) cache.size; ++i) { + cache.cells[i].pos = -1; + cache.cells[i].seq_id.clear(); + } + cache.head = 0; +} + +static void llama_kv_cache_seq_rm( + struct llama_kv_cache & cache, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + uint32_t new_head = cache.size; + + if (p0 < 0) p0 = 0; + if (p1 < 0) p1 = std::numeric_limits::max(); + + for (uint32_t i = 0; i < cache.size; ++i) { + if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { + if (seq_id < 0) { + cache.cells[i].seq_id.clear(); + } else if (cache.cells[i].has_seq_id(seq_id)) { + cache.cells[i].seq_id.erase(seq_id); + } else { + continue; + } + if (cache.cells[i].seq_id.empty()) { + cache.cells[i].pos = -1; + if (new_head == cache.size) new_head = i; + } + } + } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != cache.size) cache.head = new_head; +} + +static void llama_kv_cache_seq_cp( + struct llama_kv_cache & cache, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + if (p0 < 0) p0 = 0; + if (p1 < 0) p1 = std::numeric_limits::max(); + + cache.head = 0; + + for (uint32_t i = 0; i < cache.size; ++i) { + if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { + cache.cells[i].seq_id.insert(seq_id_dst); + } + } +} + +static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) { + uint32_t new_head = cache.size; + + for (uint32_t i = 0; i < cache.size; ++i) { + if (!cache.cells[i].has_seq_id(seq_id)) { + cache.cells[i].pos = -1; + cache.cells[i].seq_id.clear(); + if (new_head == cache.size) new_head = i; + } else { + cache.cells[i].seq_id.clear(); + cache.cells[i].seq_id.insert(seq_id); + } + } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != cache.size) cache.head = new_head; +} + +static void llama_kv_cache_seq_shift( + struct llama_kv_cache & cache, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + uint32_t new_head = cache.size; + + if (p0 < 0) p0 = 0; + if (p1 < 0) p1 = std::numeric_limits::max(); + + for (uint32_t i = 0; i < cache.size; ++i) { + if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { + cache.has_shift = true; + cache.cells[i].pos += delta; + cache.cells[i].delta += delta; + + if (cache.cells[i].pos < 0) { + cache.cells[i].pos = -1; + cache.cells[i].seq_id.clear(); + if (new_head == cache.size) new_head = i; + } + } + } + + // If we freed up a slot, set head to it so searching can start there. + // Otherwise we just start the next search from the beginning. + cache.head = new_head != cache.size ? new_head : 0; +} + // // model loading and saving // @@ -1215,12 +1616,14 @@ static bool llama_kv_cache_init( enum llama_fver { GGUF_FILE_VERSION_V1 = 1, GGUF_FILE_VERSION_V2 = 2, + GGUF_FILE_VERSION_V3 = 3, }; static const char * llama_file_version_name(llama_fver version) { switch (version) { case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)"; - case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)"; + case GGUF_FILE_VERSION_V2: return "GGUF V2"; + case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)"; } return "unknown"; @@ -1250,6 +1653,7 @@ struct llama_model_loader { int n_created = 0; int64_t n_elements = 0; + size_t n_bytes = 0; bool use_mmap = false; @@ -1282,6 +1686,7 @@ struct llama_model_loader { const char * name = gguf_get_tensor_name(ctx_gguf, i); struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name); n_elements += ggml_nelements(t); + n_bytes += ggml_nbytes(t); } LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", @@ -1407,7 +1812,7 @@ struct llama_model_loader { } } - struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) { + struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) { if (backend != GGML_BACKEND_CPU) { ggml_set_no_alloc(ctx, true); } @@ -1425,13 +1830,19 @@ struct llama_model_loader { return tensor; } - struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, ggml_backend backend) { + struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, ggml_backend_type backend) { struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str()); if (cur == NULL) { throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); } + if (backend == GGML_BACKEND_GPU_SPLIT) { + if (ne.size() == 1) { + throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str())); + } + } + { bool is_ok = true; for (size_t i = 0; i < ne.size(); ++i) { @@ -1527,7 +1938,7 @@ struct llama_model_loader { lmlock->grow_to(size_lock); } break; -#if defined(GGML_USE_CUBLAS) +#ifdef GGML_USE_CUBLAS case GGML_BACKEND_GPU: case GGML_BACKEND_GPU_SPLIT: // old code: @@ -1560,7 +1971,15 @@ struct llama_model_loader { // load LLaMA models // -std::string llama_model_ftype_name(enum llama_ftype ftype) { +static std::string llama_model_arch_name(llm_arch arch) { + auto it = LLM_ARCH_NAMES.find(arch); + if (it == LLM_ARCH_NAMES.end()) { + return "unknown"; + } + return it->second; +} + +static std::string llama_model_ftype_name(llama_ftype ftype) { if (ftype & LLAMA_FTYPE_GUESSED) { return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)"; } @@ -1593,9 +2012,12 @@ std::string llama_model_ftype_name(enum llama_ftype ftype) { static const char * llama_model_type_name(e_model type) { switch (type) { + case MODEL_1B: return "1B"; case MODEL_3B: return "3B"; case MODEL_7B: return "7B"; + case MODEL_8B: return "8B"; case MODEL_13B: return "13B"; + case MODEL_15B: return "15B"; case MODEL_30B: return "30B"; case MODEL_34B: return "34B"; case MODEL_40B: return "40B"; @@ -1614,10 +2036,7 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) { static void llm_load_hparams( llama_model_loader & ml, - llama_model & model, - int n_ctx, - float rope_freq_base, - float rope_freq_scale) { + llama_model & model) { struct gguf_context * ctx = ml.ctx_gguf; const auto kv = LLM_KV(model.arch); @@ -1628,40 +2047,41 @@ static void llm_load_hparams( GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME)); // get hparams kv - GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST)); - GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH)); - GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); - GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); - GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); - GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); + GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST)); + GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH)); + GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); + GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); + GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); + GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); // n_head_kv is optional, default to n_head hparams.n_head_kv = hparams.n_head; GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); - // TODO: manually setting rope freq base and scale should override this - // FIXME: partial fix when the param specified is not the default value, but - // will not work for overriding the model value to the params default + hparams.rope_finetuned = false; + GGUF_GET_KEY(ctx, hparams.rope_finetuned, gguf_get_val_bool, GGUF_TYPE_BOOL, false, + kv(LLM_KV_ROPE_SCALING_FINETUNED)); - llama_context_params defaults = llama_context_default_params(); + hparams.n_yarn_orig_ctx = hparams.n_ctx_train; + GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, + kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN)); - // rope_freq_base - { - float ropebase = 10000.0f; - GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); - if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) { - rope_freq_base = ropebase; - } - } + // rope_freq_base (optional) + hparams.rope_freq_base_train = 10000.0f; + GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); + + std::string rope_scaling("linear"); + GGUF_GET_KEY(ctx, rope_scaling, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE)); + hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); + GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED); // rope_freq_scale (inverse of the kv) is optional - { - float ropescale = 1.0f; + float ropescale = 0.0f; + GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR)); + if (ropescale == 0.0f) { // try the old key name GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); - if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) { - rope_freq_scale = 1.0f/ropescale; - } } + hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale; // sanity check for n_rot (optional) { @@ -1713,18 +2133,68 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_STARCODER: + { + GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + switch (hparams.n_layer) { + case 24: model.type = e_model::MODEL_1B; break; + case 36: model.type = e_model::MODEL_3B; break; + case 42: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_15B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_PERSIMMON: + { + GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + switch (hparams.n_layer) { + case 36: model.type = e_model::MODEL_8B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_REFACT: + { + GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_1B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_BLOOM: + { + GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + + switch (hparams.n_layer) { + case 24: model.type = e_model::MODEL_1B; break; + case 30: + switch (hparams.n_embd) { + case 2560: model.type = e_model::MODEL_3B; break; + case 4096: model.type = e_model::MODEL_7B; break; + } break; + } + } break; + case LLM_ARCH_MPT: + { + hparams.f_clamp_kqv = 0.0f; + + GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV)); + GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS)); + + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 48: model.type = e_model::MODEL_30B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; - }; + } model.ftype = ml.ftype; - - hparams.n_ctx = n_ctx; - hparams.rope_freq_base = rope_freq_base; - hparams.rope_freq_scale = rope_freq_scale; } // TODO: This should probably be in llama.h -static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos); +static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false); static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch); static void llm_load_vocab( @@ -1741,20 +2211,18 @@ static void llm_load_vocab( throw std::runtime_error("cannot find tokenizer vocab in model file\n"); } + const float * scores = nullptr; const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str()); - if (score_idx == -1) { - throw std::runtime_error("cannot find tokenizer scores in model file\n"); + if (score_idx != -1) { + scores = (const float * ) gguf_get_arr_data(ctx, score_idx); } - const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx); - + const int * toktypes = nullptr; const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str()); - if (toktype_idx == -1) { - throw std::runtime_error("cannot find token type list in GGUF file\n"); + if (toktype_idx != -1) { + toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); } - const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); - // determine vocab type { std::string tokenizer_name; @@ -1783,6 +2251,7 @@ static void llm_load_vocab( for (int i = 0; i < n_merges; i++) { const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i); + GGML_ASSERT(codepoints_from_utf8(word).size() > 0); std::string first; std::string second; @@ -1817,58 +2286,186 @@ static void llm_load_vocab( for (uint32_t i = 0; i < n_vocab; i++) { std::string word = gguf_get_arr_str(ctx, token_idx, i); + GGML_ASSERT(codepoints_from_utf8(word).size() > 0); vocab.token_to_id[word] = i; auto & token_data = vocab.id_to_token[i]; token_data.text = std::move(word); - token_data.score = scores[i]; - token_data.type = (llama_token_type) toktypes[i]; + token_data.score = scores ? scores[i] : 0.0f; + token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL; } + GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size()); // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); } else { - vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0]; + const std::vector ids = llama_tokenize_internal(vocab, "\u010A", false); + GGML_ASSERT(!ids.empty() && "model vocab missing newline token"); + vocab.linefeed_id = ids[0]; } // special tokens - GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID)); - GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID)); - GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID)); - GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID)); - GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID)); -} + { + const std::vector> special_token_types = { + { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id }, + { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id }, + { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id }, + { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id }, + { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id }, + }; + for (const auto & it : special_token_types) { + const std::string & key = kv(std::get<0>(it)); + int32_t & id = std::get<1>(it), old_id = id; + + GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key); + // Must be >= -1 and < vocab size. Since the key is unsigned, -1 + // can only come from the default value, so there's no point in + // validating that. + if (size_t(id + 1) > vocab.id_to_token.size()) { + LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n", + __func__, key.c_str(), id, old_id); + id = old_id; + } + } + } -static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { - const auto & hparams = model.hparams; - const auto & vocab = model.vocab; + // build special tokens cache + { + // TODO: It is unclear (to me) at this point, whether special tokes are guaranteed to be of a deterministic type, + // and will always be correctly labeled in 'added_tokens.json' etc. + // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed + // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer + // are special tokens. + // From testing, this appears to corelate 1:1 with special tokens. + // + + // Counting special tokens and verifying in only one direction + // is sufficient to detect difference in those two sets. + // + uint32_t special_tokens_count_by_type = 0; + uint32_t special_tokens_count_from_verification = 0; + + bool special_tokens_definition_mismatch = false; + + for (const auto & t : vocab.token_to_id) { + const auto & token = t.first; + const auto & id = t.second; + + // Count all non-normal tokens in the vocab while iterating + if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) { + special_tokens_count_by_type++; + } - // hparams - LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver)); - LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str()); - LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix - LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab); - LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size()); - LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); - LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx); - LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); - LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head); - LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); - LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); - LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim - LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa()); - LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps); - LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); - LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); - LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); - LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); - LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); - LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); - LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9); + // Skip single character tokens + if (token.length() > 1) { + bool is_tokenizable = false; - // general kv + // Split token string representation in two, in all possible ways + // and check if both halves can be matched to a valid token + for (unsigned i = 1; i < token.length();) { + const auto left = token.substr(0, i); + const auto right = token.substr(i); + + // check if we didnt partition in the middle of a utf sequence + auto utf = utf8_len(left.at(left.length() - 1)); + + if (utf == 1) { + if (vocab.token_to_id.find(left) != vocab.token_to_id.end() && + vocab.token_to_id.find(right) != vocab.token_to_id.end() ) { + is_tokenizable = true; + break; + } + i++; + } else { + // skip over the rest of multibyte utf sequence + i += utf - 1; + } + } + + if (!is_tokenizable) { + // Some tokens are multibyte, but they are utf sequences with equivalent text length of 1 + // it's faster to re-filter them here, since there are way less candidates now + + // Calculate a total "utf" length of a token string representation + size_t utf8_str_len = 0; + for (unsigned i = 0; i < token.length();) { + utf8_str_len++; + i += utf8_len(token.at(i)); + } + + // And skip the ones which are one character + if (utf8_str_len > 1) { + // At this point what we have left are special tokens only + vocab.special_tokens_cache[token] = id; + + // Count manually found special tokens + special_tokens_count_from_verification++; + + // If this manually found special token is not marked as such, flag a mismatch + if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) { + special_tokens_definition_mismatch = true; + } + } + } + } + } + + if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) { + LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n", + __func__, + special_tokens_count_from_verification, vocab.id_to_token.size(), + special_tokens_count_by_type, vocab.id_to_token.size() + ); + } else { + LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n", + __func__, + special_tokens_count_from_verification, vocab.id_to_token.size() + ); + } + } +} + +static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { + const auto & hparams = model.hparams; + const auto & vocab = model.vocab; + + const auto rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train); + + // hparams + LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver)); + LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str()); + LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix + LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab); + LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size()); + LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); + LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); + LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head); + LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); + LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); + LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim + LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa()); + LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps); + LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); + LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); + LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias); + LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); + LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str()); + LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); + LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); + LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx); + LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); + LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); + LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); + LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); + if (ml.n_bytes < GB) { + LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); + } else { + LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); + } + + // general kv LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str()); // special tokens @@ -1883,13 +2480,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { static void llm_load_tensors( llama_model_loader & ml, llama_model & model, - int n_batch, int n_gpu_layers, int main_gpu, const float * tensor_split, - const bool mul_mat_q, - bool low_vram, - ggml_type memory_type, bool use_mlock, llama_progress_callback progress_callback, void * progress_callback_user_data) { @@ -1928,11 +2521,9 @@ static void llm_load_tensors( } (void) main_gpu; - (void) mul_mat_q; -#if defined(GGML_USE_CUBLAS) +#ifdef GGML_USE_CUBLAS LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__); ggml_cuda_set_main_device(main_gpu); - ggml_cuda_set_mul_mat_q(mul_mat_q); #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT #elif defined(GGML_USE_CLBLAST) @@ -1955,21 +2546,22 @@ static void llm_load_tensors( const auto tn = LLM_TN(model.arch); switch (model.arch) { case LLM_ARCH_LLAMA: + case LLM_ARCH_REFACT: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output { - ggml_backend backend_norm; - ggml_backend backend_output; + ggml_backend_type backend_norm; + ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = LLAMA_BACKEND_OFFLOAD; #else - backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; #endif // _WIN32 backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; @@ -1996,8 +2588,8 @@ static void llm_load_tensors( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT auto & layer = model.layers[i]; @@ -2010,32 +2602,32 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); } } } break; case LLM_ARCH_BAICHUAN: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); { - ggml_backend backend_norm; - ggml_backend backend_output; + ggml_backend_type backend_norm; + ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = LLAMA_BACKEND_OFFLOAD; #else - backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; #endif // _WIN32 backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; @@ -2062,8 +2654,8 @@ static void llm_load_tensors( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT auto & layer = model.layers[i]; @@ -2076,15 +2668,15 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); } } } break; @@ -2092,20 +2684,20 @@ static void llm_load_tensors( { // TODO: CPU-only for now - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output { - ggml_backend backend_norm; - ggml_backend backend_output; + ggml_backend_type backend_norm; + ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = LLAMA_BACKEND_OFFLOAD; #else - backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; #endif // _WIN32 backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; @@ -2134,8 +2726,8 @@ static void llm_load_tensors( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT auto & layer = model.layers[i]; @@ -2155,40 +2747,321 @@ static void llm_load_tensors( layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) + - ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); + } + } + } break; + case LLM_ARCH_STARCODER: + { + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + + // output + { + ggml_backend_type backend_norm; + ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = LLAMA_BACKEND_OFFLOAD; +#else + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#endif // _WIN32 + + backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.output_norm); + vram_weights += ggml_nbytes(model.output_norm_b); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + vram_weights += ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + + if (backend == GGML_BACKEND_GPU) { + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b) + + ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b); + } + } + } break; + case LLM_ARCH_PERSIMMON: + { + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + + { + ggml_backend_type backend_norm; + ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = LLAMA_BACKEND_OFFLOAD; +#else + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#endif // _WIN32 + + backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.output_norm); + vram_weights += ggml_nbytes(model.output_norm_b); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + vram_weights += ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff; + const int i_gpu_start = n_layer - n_gpu_layers; + model.layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; + auto & layer = model.layers[i]; + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend); + layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); + layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend); + layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); + } + } break; + case LLM_ARCH_BLOOM: + { + // TODO: CPU-only for now + + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); + model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); + + // output + { + ggml_backend_type backend_norm; + ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = LLAMA_BACKEND_OFFLOAD; +#else + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#endif // _WIN32 + + backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.output_norm); + vram_weights += ggml_nbytes(model.output_norm_b); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + vram_weights += ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + + if (backend == GGML_BACKEND_GPU) { + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + + ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) + + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b); + } + } + } break; + case LLM_ARCH_MPT: + { + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + + // output + { + ggml_backend_type backend_norm; + ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = LLAMA_BACKEND_OFFLOAD; +#else + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#endif // _WIN32 + + backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.output_norm); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + vram_weights += ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + + if (backend == GGML_BACKEND_GPU) { + vram_weights += + ggml_nbytes(layer.attn_norm) + + ggml_nbytes(layer.wqkv) + + ggml_nbytes(layer.wo) + + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.ffn_down) + + ggml_nbytes(layer.ffn_up); } } } break; default: throw std::runtime_error("unknown architecture"); - }; + } } ml.done_getting_tensors(); // print memory requirements { - const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1; - // this is the total memory required to run the inference size_t mem_required = ctx_size + mmapped_size - vram_weights; // weights in VRAM not in memory - // this is the memory required by one llama_state - const size_t mem_required_state = scale*hparams.kv_size(); - - LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, - mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); - - (void) n_batch; + LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0); #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); @@ -2197,36 +3070,17 @@ static void llm_load_tensors( if (n_gpu_layers > (int) hparams.n_layer) { LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__); } - size_t vram_kv_cache = 0; #ifdef GGML_USE_CUBLAS const int max_backend_supported_layers = hparams.n_layer + 3; - const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3; - if (n_gpu_layers > (int) hparams.n_layer + 1) { - if (low_vram) { - LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__); - } else { - LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__); - vram_kv_cache += hparams.kv_size() / 2; - } - } - if (n_gpu_layers > (int) hparams.n_layer + 2) { - if (low_vram) { - LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__); - } else { - LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__); - vram_kv_cache += hparams.kv_size() / 2; - } - } -#elif defined(GGML_USE_CLBLAST) + const int max_offloadable_layers = hparams.n_layer + 3; +#elif GGML_USE_CLBLAST const int max_backend_supported_layers = hparams.n_layer + 1; - const int max_offloadable_layers = hparams.n_layer + 1; + const int max_offloadable_layers = hparams.n_layer + 1; #endif // GGML_USE_CUBLAS - LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", - __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); - LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n", - __func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up + LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); + LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0); #else (void) n_gpu_layers; #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) @@ -2239,7 +3093,7 @@ static void llm_load_tensors( } (void) tensor_split; -#if defined(GGML_USE_CUBLAS) +#ifdef GGML_USE_CUBLAS { ggml_cuda_set_tensor_split(tensor_split); } @@ -2258,46 +3112,31 @@ static void llm_load_tensors( model.t_load_us = ggml_time_us() - model.t_start_us; } -static bool llama_model_load( - const std::string & fname, - llama_model & model, - int n_ctx, - int n_batch, - int n_gpu_layers, - int main_gpu, - const float * tensor_split, - const bool mul_mat_q, - float rope_freq_base, - float rope_freq_scale, - bool low_vram, - ggml_type memory_type, - bool use_mmap, - bool use_mlock, - bool vocab_only, - llama_progress_callback progress_callback, - void *progress_callback_user_data) { +static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { try { - std::unique_ptr ml(new llama_model_loader(fname, use_mmap)); + llama_model_loader ml(fname, params.use_mmap); + + model.hparams.vocab_only = params.vocab_only; - llm_load_arch (*ml, model); - llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale); - llm_load_vocab (*ml, model); + llm_load_arch (ml, model); + llm_load_hparams(ml, model); + llm_load_vocab (ml, model); - llm_load_print_meta(*ml, model); + llm_load_print_meta(ml, model); if (model.hparams.n_vocab != model.vocab.id_to_token.size()) { throw std::runtime_error("vocab size mismatch"); } - if (vocab_only) { + if (params.vocab_only) { LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); return true; } llm_load_tensors( - *ml, model, n_batch, n_gpu_layers, - main_gpu, tensor_split, mul_mat_q, low_vram, memory_type, - use_mlock, progress_callback, progress_callback_user_data); + ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, + params.progress_callback, params.progress_callback_user_data + ); } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); return false; @@ -2306,1092 +3145,1995 @@ static bool llama_model_load( return true; } -static struct ggml_cgraph * llm_build_llama( - llama_context & lctx, - const llama_token * tokens, - const float * embd, - int n_tokens, - int n_past) { +// +// llm_build +// - GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT +using llm_build_cb = std::function; - const int N = n_tokens; +enum llm_rope_type { + LLM_ROPE, + LLM_ROPE_NEOX, + LLM_ROPE_GLM, +}; - const auto & model = lctx.model; - const auto & hparams = model.hparams; +enum llm_ffn_op_type { + LLM_FFN_SILU, + LLM_FFN_GELU, + LLM_FFN_RELU, + LLM_FFN_RELU_SQR, +}; - const auto & kv_self = lctx.kv_self; +enum llm_ffn_gate_type { + LLM_FFN_SEQ, + LLM_FFN_PAR, // ffn_gate is parallel to ffn_up +}; - GGML_ASSERT(!!kv_self.ctx); +enum llm_norm_type { + LLM_NORM, + LLM_NORM_RMS, +}; - const int64_t n_embd = hparams.n_embd; +static struct ggml_tensor * llm_build_inp_embd( + struct ggml_context * ctx, + const llama_hparams & hparams, + const llama_batch & batch, + struct ggml_tensor * tok_embd, + const llm_build_cb & cb) { + const int64_t n_embd = hparams.n_embd; + + struct ggml_tensor * inpL; + + if (batch.token) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); + cb(inp_tokens, "inp_tokens", -1); + + inpL = ggml_get_rows(ctx, tok_embd, inp_tokens); + } else { +#ifdef GGML_USE_MPI + GGML_ASSERT(false && "not implemented"); +#endif + + inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens); + } + + return inpL; +} + +// Persimmon: n_rot = n_embd_head/2 +// Other: n_rot = n_embd_head +static void llm_build_k_shift( + struct ggml_context * ctx, + const llama_hparams & hparams, + const llama_cparams & cparams, + const llama_kv_cache & kv, + struct ggml_cgraph * graph, + llm_rope_type type, + int64_t n_ctx, + int64_t n_rot, + float freq_base, + float freq_scale, + const llm_build_cb & cb) { const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = hparams.n_ctx; - const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const int64_t n_embd_head = hparams.n_embd_head(); + const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; + const float ext_factor = cparams.yarn_ext_factor; + const float attn_factor = cparams.yarn_attn_factor; + const float beta_fast = cparams.yarn_beta_fast; + const float beta_slow = cparams.yarn_beta_slow; - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head % n_rot == 0); - const float freq_base = hparams.rope_freq_base; - const float freq_scale = hparams.rope_freq_scale; - const float norm_rms_eps = hparams.f_norm_rms_eps; + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); + cb(K_shift, "K_shift", -1); - const int n_gpu_layers = model.n_gpu_layers; + int rope_type = 0; - auto & buf_compute = lctx.buf_compute; + switch (type) { + case LLM_ROPE: rope_type = 0; break; + case LLM_ROPE_NEOX: rope_type = 2; break; + case LLM_ROPE_GLM: rope_type = 4; break; + } - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ false, - }; + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * tmp = + // we rotate only the first n_rot dimensions + ggml_rope_custom_inplace(ctx, + ggml_view_3d(ctx, kv.k, + n_rot, n_head_kv, n_ctx, + ggml_element_size(kv.k)*n_embd_head, + ggml_element_size(kv.k)*n_embd_gqa, + ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il), + K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(tmp, "K_shifted", il); + ggml_build_forward_expand(graph, tmp); + } +} + +static void llm_build_kv_store( + struct ggml_context * ctx, + const llama_hparams & hparams, + const llama_kv_cache & kv, + struct ggml_cgraph * graph, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + int64_t n_ctx, + int32_t n_tokens, + int32_t kv_head, + const llm_build_cb & cb, + int64_t il) { + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + // compute the transposed [n_tokens, n_embd] V matrix + struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens)); + //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed + cb(v_cur_t, "v_cur_t", il); + + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k, n_tokens*n_embd_gqa, + (ggml_element_size(kv.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + cb(k_cache_view, "k_cache_view", il); + + struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v, n_tokens, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv.v), + (il*n_ctx)*ggml_element_size(kv.v)*n_embd_gqa + kv_head*ggml_element_size(kv.v)); + cb(v_cache_view, "v_cache_view", il); + + // important: storing RoPE-ed version of K in the KV cache! + ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); + ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view)); +} + +static struct ggml_tensor * llm_build_norm( + struct ggml_context * ctx, + struct ggml_tensor * cur, + const llama_hparams & hparams, + struct ggml_tensor * mw, + struct ggml_tensor * mb, + llm_norm_type type, + const llm_build_cb & cb, + int il) { + switch (type) { + case LLM_NORM: cur = ggml_norm (ctx, cur, hparams.f_norm_eps); break; + case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); break; + } - params.no_alloc = true; + if (mw || mb) { + cb(cur, "norm", il); + } - struct ggml_context * ctx0 = ggml_init(params); + if (mw) { + cur = ggml_mul(ctx, cur, mw); + if (mb) { + cb(cur, "norm_w", il); + } + } - ggml_cgraph * gf = ggml_new_graph(ctx0); + if (mb) { + cur = ggml_add(ctx, cur, mb); + } - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + return cur; +} + +static struct ggml_tensor * llm_build_ffn( + struct ggml_context * ctx, + struct ggml_tensor * cur, + struct ggml_tensor * up, + struct ggml_tensor * up_b, + struct ggml_tensor * gate, + struct ggml_tensor * gate_b, + struct ggml_tensor * down, + struct ggml_tensor * down_b, + llm_ffn_op_type type_op, + llm_ffn_gate_type type_gate, + const llm_build_cb & cb, + int il) { + struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur); + cb(tmp, "ffn_up", il); - if (tokens) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + if (up_b) { + tmp = ggml_add(ctx, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); + if (gate) { + switch (type_gate) { + case LLM_FFN_SEQ: + { + cur = ggml_mul_mat(ctx, gate, tmp); + cb(cur, "ffn_gate", il); + } break; + case LLM_FFN_PAR: + { + cur = ggml_mul_mat(ctx, gate, cur); + cb(cur, "ffn_gate", il); + } break; } - ggml_set_name(inp_tokens, "inp_tokens"); - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + if (gate_b) { + cur = ggml_add(ctx, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif + cur = tmp; + } - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + switch (type_op) { + case LLM_FFN_SILU: + { + cur = ggml_silu(ctx, cur); + cb(cur, "ffn_silu", il); + } break; + case LLM_FFN_GELU: + { + cur = ggml_gelu(ctx, cur); + cb(cur, "ffn_gelu", il); + } break; + case LLM_FFN_RELU: + { + cur = ggml_relu(ctx, cur); + cb(cur, "ffn_relu", il); + } break; + case LLM_FFN_RELU_SQR: + { + cur = ggml_relu(ctx, cur); + cb(cur, "ffn_relu", il); - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); - } + cur = ggml_sqr(ctx, cur); + cb(cur, "ffn_sqr(relu)", il); + } break; } - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - // - // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal - // in that case ggml_cuda_assign_buffers has no effect - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; + if (type_gate == LLM_FFN_PAR) { + cur = ggml_mul(ctx, cur, tmp); + cb(cur, "ffn_gate_par", il); + } -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; + cur = ggml_mul_mat(ctx, down, cur); + if (down_b) { + cb(cur, "ffn_down", il); } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; + + if (down_b) { + cur = ggml_add(ctx, cur, down_b); } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + + return cur; +} + +// if max_alibi_bias > 0 then apply ALiBi +static struct ggml_tensor * llm_build_kqv( + struct ggml_context * ctx, + const llama_hparams & hparams, + const llama_kv_cache & kv, + struct ggml_tensor * wo, + struct ggml_tensor * wo_b, + struct ggml_tensor * q_cur, + struct ggml_tensor * kq_scale, + struct ggml_tensor * kq_mask, + int64_t n_ctx, + int32_t n_tokens, + int32_t n_kv, + float max_alibi_bias, + const llm_build_cb & cb, + int il) { + const int64_t n_embd = hparams.n_embd; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); + cb(q, "q", il); + + struct ggml_tensor * k = + ggml_view_3d(ctx, kv.k, + n_embd_head, n_kv, n_head_kv, + ggml_element_size(kv.k)*n_embd_gqa, + ggml_element_size(kv.k)*n_embd_head, + ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il); + cb(k, "k", il); + + struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); + cb(kq, "kq", il); + + kq = ggml_scale(ctx, kq, kq_scale); + cb(kq, "kq_scaled", il); + + if (max_alibi_bias > 0.0f) { + // TODO: n_head or n_head_kv + // TODO: K-shift is likely not working + // TODO: change to ggml_add + kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias); + cb(kq, "kq_scaled_alibi", il); + } + + kq = ggml_add(ctx, kq, kq_mask); + cb(kq, "kq_masked", il); + + kq = ggml_soft_max(ctx, kq); + cb(kq, "kq_soft_max", il); + + // split cached v into n_head heads + struct ggml_tensor * v = + ggml_view_3d(ctx, kv.v, + n_kv, n_embd_head, n_head_kv, + ggml_element_size(kv.v)*n_ctx, + ggml_element_size(kv.v)*n_ctx*n_embd_head, + ggml_element_size(kv.v)*n_ctx*n_embd_gqa*il); + cb(v, "v", il); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); + cb(kqv, "kqv", il); + + struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); + + struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens); + cb(cur, "kqv_merged_cont", il); + + cur = ggml_mul_mat(ctx, wo, cur); + if (wo_b) { + cb(cur, "kqv_wo", il); + } + + if (wo_b) { + cur = ggml_add(ctx, cur, wo_b); + } + + return cur; +} + +struct llm_build_context { + const llama_model & model; + const llama_hparams & hparams; + const llama_cparams & cparams; + const llama_batch & batch; + const llama_kv_cache & kv_self; + + const int64_t n_embd; + const int64_t n_layer; + const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) + const int64_t n_head; + const int64_t n_head_kv; + const int64_t n_embd_head; + const int64_t n_embd_gqa; + + const float freq_base; + const float freq_scale; + const float ext_factor; + const float attn_factor; + const float beta_fast; + const float beta_slow; + const float norm_eps; + const float norm_rms_eps; + + const int32_t n_tokens; + const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx) + const int32_t kv_head; // index of where we store new KV data in the cache + const int32_t n_orig_ctx; + + const bool do_rope_shift; + + const llm_build_cb & cb; + + llama_buffer & buf_compute; + + struct ggml_context * ctx0 = nullptr; + + // TODO: consider making the entire interface noexcept + llm_build_context( + llama_context & lctx, + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case) : + model (lctx.model), + hparams (model.hparams), + cparams (lctx.cparams), + batch (batch), + kv_self (lctx.kv_self), + n_embd (hparams.n_embd), + n_layer (hparams.n_layer), + n_ctx (cparams.n_ctx), + n_head (hparams.n_head), + n_head_kv (hparams.n_head_kv), + n_embd_head (hparams.n_embd_head()), + n_embd_gqa (hparams.n_embd_gqa()), + freq_base (cparams.rope_freq_base), + freq_scale (cparams.rope_freq_scale), + ext_factor (cparams.yarn_ext_factor), + attn_factor (cparams.yarn_attn_factor), + beta_fast (cparams.yarn_beta_fast), + beta_slow (cparams.yarn_beta_slow), + norm_eps (hparams.f_norm_eps), + norm_rms_eps (hparams.f_norm_rms_eps), + n_tokens (batch.n_tokens), + n_kv (worst_case ? n_ctx : kv_self.n), + kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), + n_orig_ctx (cparams.n_yarn_orig_ctx), + do_rope_shift (worst_case || kv_self.has_shift), + cb (cb), + buf_compute (lctx.buf_compute) { + GGML_ASSERT(!!kv_self.ctx); + + // all initializations should be done in init() + } + + void init() { + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.data, + /*.no_alloc =*/ true, + }; + + ctx0 = ggml_init(params); } -#endif // GGML_USE_CUBLAS - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + void free() { + if (ctx0) { + ggml_free(ctx0); + ctx0 = nullptr; + } } - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - for (int il = 0; il < n_layer; ++il) { - ggml_format_name(inpL, "layer_inp_%d", il); + struct ggml_cgraph * build_llama() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); - offload_func_t offload_func = llama_nop; + GGML_ASSERT(n_embd_head == hparams.n_rot); -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); } -#endif // GGML_USE_CUBLAS - struct ggml_tensor * inpSA = inpL; + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; - // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_0"); + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); - // cur = cur*attn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); - ggml_set_name(cur, "attention_norm_0"); + // input for next layer + inpL = cur; } - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - offload_func_kq(tmpk); - ggml_set_name(tmpk, "tmpk"); + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_baichuan() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - offload_func_kq(tmpq); - ggml_set_name(tmpq, "tmpq"); + struct ggml_tensor * cur; + struct ggml_tensor * inpL; - struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(Kcur); - ggml_set_name(Kcur, "Kcur"); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); - struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(Qcur); - ggml_set_name(Qcur, "Qcur"); + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); - // store key and value to memory + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention { - // compute the transposed [N, n_embd] V matrix + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + switch (model.type) { + case MODEL_7B: + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + break; + case MODEL_13B: + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens); + break; + default: + GGML_ASSERT(false); + } + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); - struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - offload_func_v(tmpv); - ggml_set_name(tmpv, "tmpv"); + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N)); - offload_func_v(Vcur); - ggml_set_name(Vcur, "Vcur"); + // apply ALiBi for 13B model + const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f; - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); - offload_func_kq(k); - ggml_set_name(k, "k"); + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, cb, il); + cb(cur, "kqv_out", il); + } - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); - offload_func_v(v); - ggml_set_name(v, "v"); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); } - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_past + N, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_past + N, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); - -#if 1 - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); -#else - // make V contiguous in memory to speed up the matmul, however we waste time on the copy - // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation - // is there a better way? - struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head)); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); -#endif + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); + struct ggml_cgraph * build_falcon() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); - // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); + struct ggml_tensor * cur; + struct ggml_tensor * inpL; - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - offload_func(cur); - ggml_set_name(cur, "result_wo"); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); } - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); - ggml_set_name(inpFF, "inpFF"); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * attn_norm; - // feed-forward network - { - // norm + attn_norm = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(attn_norm, "attn_norm", il); + + // self-attention { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_1"); - - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); - ggml_set_name(cur, "ffn_norm"); + if (model.layers[il].attn_norm_2) { + // Falcon-40B + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm_2, + model.layers[il].attn_norm_2_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm_2", il); + } else { + cur = attn_norm; + } + + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + // using mode = 2 for neox mode + Qcur = ggml_rope_custom( + ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - offload_func(tmp); - ggml_set_name(tmp, "result_w3"); + struct ggml_tensor * ffn_inp = cur; - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - offload_func(cur); - ggml_set_name(cur, "result_w1"); + // feed forward + { + cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result + model.layers[il].ffn_up, NULL, + NULL, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + } - // SILU activation - cur = ggml_silu(ctx0, cur); - offload_func(cur); - ggml_set_name(cur, "silu"); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); - cur = ggml_mul(ctx0, cur, tmp); - offload_func(cur); - ggml_set_name(cur, "silu_x_result_w3"); + cur = ggml_add(ctx0, cur, inpL); + cb(cur, "l_out", il); - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - offload_func(cur); - ggml_set_name(cur, "result_w2"); + // input for next layer + inpL = cur; } - cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); - ggml_set_name(cur, "inpFF_+_result_w2"); + cur = inpL; - // input for next layer - inpL = cur; - } + // norm + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); - cur = inpL; + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); - // norm - { - cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - offload_func_nr(cur); - ggml_set_name(cur, "rms_norm_2"); + ggml_build_forward_expand(gf, cur); - // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.output_norm); - // offload_func_nr(cur); // TODO CPU + GPU mirrored backend - ggml_set_name(cur, "result_norm"); + return gf; } - // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + struct ggml_cgraph * build_starcoder() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); - ggml_build_forward_expand(gf, cur); + struct ggml_tensor * cur; + struct ggml_tensor * pos; + struct ggml_tensor * inpL; - ggml_free(ctx0); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); - return gf; -} + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); -static struct ggml_cgraph * llm_build_baichaun( - llama_context & lctx, - const llama_token * tokens, - const float * embd, - int n_tokens, - int n_past) { + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); - GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT + pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); + cb(pos, "pos_embd", -1); - const int N = n_tokens; + inpL = ggml_add(ctx0, inpL, pos); + cb(inpL, "inpL", -1); - const auto & model = lctx.model; - const auto & hparams = model.hparams; + for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); - const auto & kv_self = lctx.kv_self; + // self-attention + { + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); - GGML_ASSERT(!!kv_self.ctx); + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = hparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - GGML_ASSERT(n_embd_head == hparams.n_rot); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - const float freq_base = hparams.rope_freq_base; - const float freq_scale = hparams.rope_freq_scale; - const float norm_rms_eps = hparams.f_norm_rms_eps; + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - const int n_gpu_layers = model.n_gpu_layers; + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - auto & buf_compute = lctx.buf_compute; + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); + } - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ false, - }; + // add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); - params.no_alloc = true; + // FF + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + } - struct ggml_context * ctx0 = ggml_init(params); + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); + } - ggml_cgraph * gf = ggml_new_graph(ctx0); + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_persimmon() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + const int64_t n_rot = n_embd_head / 2; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; - if (tokens) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "imp_embd", -1); - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); } - ggml_set_name(inp_tokens, "inp_tokens"); - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * residual = inpL; + + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + // self attention + { + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + // split qkv + GGML_ASSERT(n_head_kv == n_head); + + struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens); + cb(tmpqkv, "tmpqkv", il); + + struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2)); + cb(tmpqkv_perm, "tmpqkv", il); + + struct ggml_tensor * tmpq = ggml_view_3d( + ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, + ggml_element_size(tmpqkv_perm) * n_embd_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + 0 + ); + cb(tmpq, "tmpq", il); + + struct ggml_tensor * tmpk = ggml_view_3d( + ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, + ggml_element_size(tmpqkv_perm) * n_embd_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens + ); + cb(tmpk, "tmpk", il); + + // Q/K Layernorm + tmpq = llm_build_norm(ctx0, tmpq, hparams, + model.layers[il].attn_q_norm, + model.layers[il].attn_q_norm_b, + LLM_NORM, cb, il); + cb(tmpq, "tmpq", il); + + tmpk = llm_build_norm(ctx0, tmpk, hparams, + model.layers[il].attn_k_norm, + model.layers[il].attn_k_norm_b, + LLM_NORM, cb, il); + cb(tmpk, "tmpk", il); + + // RoPE the first n_rot of q/k, pass the other half, and concat. + struct ggml_tensor * qrot = ggml_view_3d( + ctx0, tmpq, n_rot, n_head, n_tokens, + ggml_element_size(tmpq) * n_embd_head, + ggml_element_size(tmpq) * n_embd_head * n_head, + 0 + ); + cb(qrot, "qrot", il); + + struct ggml_tensor * krot = ggml_view_3d( + ctx0, tmpk, n_rot, n_head, n_tokens, + ggml_element_size(tmpk) * n_embd_head, + ggml_element_size(tmpk) * n_embd_head * n_head, + 0 + ); + cb(krot, "krot", il); + + // get the second half of tmpq, e.g tmpq[n_rot:, :, :] + struct ggml_tensor * qpass = ggml_view_3d( + ctx0, tmpq, n_rot, n_head, n_tokens, + ggml_element_size(tmpq) * n_embd_head, + ggml_element_size(tmpq) * n_embd_head * n_head, + ggml_element_size(tmpq) * n_rot + ); + cb(qpass, "qpass", il); + + struct ggml_tensor * kpass = ggml_view_3d( + ctx0, tmpk, n_rot, n_head, n_tokens, + ggml_element_size(tmpk) * n_embd_head, + ggml_element_size(tmpk) * n_embd_head * n_head, + ggml_element_size(tmpk) * n_rot + ); + cb(kpass, "kpass", il); + + struct ggml_tensor * qrotated = ggml_rope_custom( + ctx0, qrot, inp_pos, n_rot, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(qrotated, "qrotated", il); + + struct ggml_tensor * krotated = ggml_rope_custom( + ctx0, krot, inp_pos, n_rot, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(krotated, "krotated", il); + + // ggml currently only supports concatenation on dim=2 + // so we need to permute qrot, qpass, concat, then permute back. + qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3)); + cb(qrotated, "qrotated", il); + + krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3)); + cb(krotated, "krotated", il); + + qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3)); + cb(qpass, "qpass", il); + + kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3)); + cb(kpass, "kpass", il); + + struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3)); + cb(Q, "Q", il); + + Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3)); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_view_3d( + ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, + ggml_element_size(tmpqkv_perm) * n_embd_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 + ); + cb(Vcur, "Vcur", il); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + // TODO: not tested, could be broken + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, model.layers[il].bo, + Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); + } - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + inpL = cur; } - } - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; + cur = inpL; - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - // - // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal - // in that case ggml_cuda_assign_buffers has no effect - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + ggml_build_forward_expand(gf, cur); + + return gf; } - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - for (int il = 0; il < n_layer; ++il) { - ggml_format_name(inpL, "layer_inp_%d", il); + struct ggml_cgraph * build_refact() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); - offload_func_t offload_func = llama_nop; + struct ggml_tensor * cur; + struct ggml_tensor * inpL; -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); - struct ggml_tensor * inpSA = inpL; + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); - // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_0"); + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); - // cur = cur*attn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); - ggml_set_name(cur, "attention_norm_0"); - } + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - offload_func_kq(tmpk); - ggml_set_name(tmpk, "tmpk"); - - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - offload_func_kq(tmpq); - ggml_set_name(tmpq, "tmpq"); - - struct ggml_tensor * Kcur; - struct ggml_tensor * Qcur; - switch (model.type) { - case MODEL_7B: - Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); - Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); - break; - case MODEL_13B: - Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N); - Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N); - break; - default: - GGML_ASSERT(false); + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + cb(Kcur, "Kcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + cb(Qcur, "Qcur", il); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); + cb(cur, "kqv_out", il); } - offload_func_kq(Kcur); - ggml_set_name(Kcur, "Kcur"); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_bloom() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; - offload_func_kq(Qcur); - ggml_set_name(Qcur, "Qcur"); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); - // store key and value to memory + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + inpL = llm_build_norm(ctx0, inpL, hparams, + model.tok_norm, + model.tok_norm_b, + LLM_NORM, cb, -1); + cb(inpL, "inp_norm", -1); + + for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention { - // compute the transposed [N, n_embd] V matrix + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); - struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - offload_func_v(tmpv); - ggml_set_name(tmpv, "tmpv"); + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N)); - offload_func_v(Vcur); - ggml_set_name(Vcur, "Vcur"); + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); - offload_func_kq(k); - ggml_set_name(k, "k"); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); - offload_func_v(v); - ggml_set_name(v, "v"); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_past + N, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - struct ggml_tensor * KQ_masked; - struct ggml_tensor * KQ_scaled_alibi; - - switch (model.type) { - case MODEL_7B: - KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - break; - case MODEL_13B: - KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8); - ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); - KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past); - break; - default: - GGML_ASSERT(false); + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); + cb(cur, "kqv_out", il); } - // KQ_masked = mask_past(KQ_scaled) - // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past); - // offload_func_kq(KQ_masked); - // ggml_set_name(KQ_masked, "KQ_masked"); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_past + N, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); - -#if 1 - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); -#else - // make V contiguous in memory to speed up the matmul, however we waste time on the copy - // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation - // is there a better way? - struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head)); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); -#endif - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); + // Add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); - // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); + // FF + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + } - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - offload_func(cur); - ggml_set_name(cur, "result_wo"); + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); } - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); - ggml_set_name(inpFF, "inpFF"); + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); - // feed-forward network - { - // norm + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_mpt() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * attn_norm; + + attn_norm = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + NULL, + LLM_NORM, cb, il); + cb(attn_norm, "attn_norm", il); + + // self-attention { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_1"); - - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); - ggml_set_name(cur, "ffn_norm"); + cur = attn_norm; + + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + if (hparams.f_clamp_kqv > 0.0f) { + cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(cur, "wqkv_clamped", il); + } + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, cb, il); + cb(cur, "kqv_out", il); } - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - offload_func(tmp); - ggml_set_name(tmp, "result_w3"); + // Add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - offload_func(cur); - ggml_set_name(cur, "result_w1"); + // feed forward + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + NULL, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + NULL, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + } - // SILU activation - cur = ggml_silu(ctx0, cur); - offload_func(cur); - ggml_set_name(cur, "silu"); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } - cur = ggml_mul(ctx0, cur, tmp); - offload_func(cur); - ggml_set_name(cur, "silu_x_result_w3"); + cur = inpL; - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - offload_func(cur); - ggml_set_name(cur, "result_w2"); + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, + NULL, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } +}; + +// +// tensor offloading helpers +// +// TODO: will be removed with backend v2 + +enum llm_offload_func_e { + OFFLOAD_FUNC_NOP, + OFFLOAD_FUNC, + OFFLOAD_FUNC_KQ, + OFFLOAD_FUNC_V, + OFFLOAD_FUNC_NR, + OFFLOAD_FUNC_EMB, + OFFLOAD_FUNC_OUT, +}; + +// TODO: will be removed with backend v2 +struct llm_offload_trie { + struct node { + ~node() { + for (int i = 0; i < 256; ++i) { + if (children[i]) { + delete children[i]; + } + } } - cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); - ggml_set_name(cur, "inpFF_+_result_w2"); + node * children[256] = { nullptr }; + llm_offload_func_e func = OFFLOAD_FUNC_NOP; + }; - // input for next layer - inpL = cur; + llm_offload_trie() { + root = new node; } - cur = inpL; + llm_offload_trie(const std::unordered_map & map) { + root = new node; - // norm - { - cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - offload_func_nr(cur); - ggml_set_name(cur, "rms_norm_2"); + for (const auto & kv : map) { + add(kv.first, kv.second); + } + } - // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.output_norm); - // offload_func_nr(cur); // TODO CPU + GPU mirrored backend - ggml_set_name(cur, "result_norm"); + ~llm_offload_trie() { + delete root; } - // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + void add(const char * name, llm_offload_func_e func) { + node * cur = root; - ggml_build_forward_expand(gf, cur); + for (int i = 0; ; ++i) { + const uint8_t c = name[i]; - ggml_free(ctx0); + if (!c) { + break; + } - return gf; -} + if (!cur->children[c]) { + cur->children[c] = new node; + } -static struct ggml_cgraph * llm_build_falcon( - llama_context & lctx, - const llama_token * tokens, - const float * embd, - int n_tokens, - int n_past) { + cur = cur->children[c]; + } - GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT + cur->func = func; + } - const int N = n_tokens; + llm_offload_func_e find(const char * name) const { + const node * cur = root; - const auto & model = lctx.model; - const auto & hparams = model.hparams; + for (int i = 0; ; ++i) { + const uint8_t c = name[i]; - const auto & kv_self = lctx.kv_self; + if (!c) { + break; + } - GGML_ASSERT(!!kv_self.ctx); + if (!cur->children[c]) { + return OFFLOAD_FUNC_NOP; + } - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = hparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); + cur = cur->children[c]; + } + + return cur->func; + } - GGML_ASSERT(n_embd_head == hparams.n_rot); + node * root = nullptr; +}; - const float freq_base = hparams.rope_freq_base; - const float freq_scale = hparams.rope_freq_scale; - const float norm_eps = hparams.f_norm_eps; +// TODO: will be removed with backend v2 +static const std::unordered_map k_offload_map = { + //{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel + //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel + { "pos_embd", OFFLOAD_FUNC_NR }, + + { "inp_pos", OFFLOAD_FUNC_KQ }, // this is often used for KQ ops (e.g. rope) + { "KQ_scale", OFFLOAD_FUNC_KQ }, + { "KQ_mask", OFFLOAD_FUNC_KQ }, + { "K_shift", OFFLOAD_FUNC_KQ }, + { "K_shifted", OFFLOAD_FUNC_KQ }, + + { "inp_norm", OFFLOAD_FUNC_NR }, + { "inp_norm_w", OFFLOAD_FUNC_NR }, + { "inp_norm_wb", OFFLOAD_FUNC_NR }, + + { "norm", OFFLOAD_FUNC }, + { "norm_w", OFFLOAD_FUNC }, + { "norm_wb", OFFLOAD_FUNC }, + + { "attn_norm", OFFLOAD_FUNC }, + { "attn_norm_2", OFFLOAD_FUNC }, + + { "wqkv", OFFLOAD_FUNC_KQ }, + { "bqkv", OFFLOAD_FUNC_KQ }, + { "wqkv_clamped", OFFLOAD_FUNC_KQ }, + + { "tmpk", OFFLOAD_FUNC_KQ }, + { "tmpq", OFFLOAD_FUNC_KQ }, + { "tmpv", OFFLOAD_FUNC_V }, + { "Kcur", OFFLOAD_FUNC_KQ }, + { "Qcur", OFFLOAD_FUNC_KQ }, + { "Vcur", OFFLOAD_FUNC_V }, + + { "krot", OFFLOAD_FUNC_KQ }, + { "qrot", OFFLOAD_FUNC_KQ }, + { "kpass", OFFLOAD_FUNC_KQ }, + { "qpass", OFFLOAD_FUNC_KQ }, + { "krotated", OFFLOAD_FUNC_KQ }, + { "qrotated", OFFLOAD_FUNC_KQ }, + + { "q", OFFLOAD_FUNC_KQ }, + { "k", OFFLOAD_FUNC_KQ }, + { "kq", OFFLOAD_FUNC_KQ }, + { "kq_scaled", OFFLOAD_FUNC_KQ }, + { "kq_scaled_alibi", OFFLOAD_FUNC_KQ }, + { "kq_masked", OFFLOAD_FUNC_KQ }, + { "kq_soft_max", OFFLOAD_FUNC_V }, + { "v", OFFLOAD_FUNC_V }, + { "kqv", OFFLOAD_FUNC_V }, + { "kqv_merged", OFFLOAD_FUNC_V }, + { "kqv_merged_cont", OFFLOAD_FUNC_V }, + { "kqv_wo", OFFLOAD_FUNC_V }, + { "kqv_out", OFFLOAD_FUNC_V }, + + { "ffn_inp", OFFLOAD_FUNC }, + { "ffn_norm", OFFLOAD_FUNC }, + + { "ffn_up", OFFLOAD_FUNC }, + { "ffn_up_b", OFFLOAD_FUNC }, + { "ffn_gate", OFFLOAD_FUNC }, + { "ffn_gate_b", OFFLOAD_FUNC }, + { "ffn_gate_par", OFFLOAD_FUNC }, + { "ffn_down", OFFLOAD_FUNC }, + { "ffn_down_b", OFFLOAD_FUNC }, + { "ffn_out", OFFLOAD_FUNC }, + + { "ffn_silu", OFFLOAD_FUNC }, + { "ffn_gelu", OFFLOAD_FUNC }, + { "ffn_relu", OFFLOAD_FUNC }, + { "ffn_sqr(relu)", OFFLOAD_FUNC }, + + { "l_out", OFFLOAD_FUNC }, + + { "result_norm", OFFLOAD_FUNC_EMB }, + { "result_output", OFFLOAD_FUNC_OUT }, +}; - const int n_gpu_layers = model.n_gpu_layers; +static llm_offload_trie k_offload_func_trie(k_offload_map); - auto & buf_compute = lctx.buf_compute; +static struct ggml_cgraph * llama_build_graph( + llama_context & lctx, + const llama_batch & batch) { + const auto & model = lctx.model; - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ false, - }; + // check if we should build the worst-case graph (for memory measurement) + const bool worst_case = ggml_allocr_is_measure(lctx.alloc); - params.no_alloc = true; + // keep track of the input that has already been allocated + bool alloc_inp_tokens = false; + bool alloc_inp_embd = false; + bool alloc_inp_pos = false; + bool alloc_inp_KQ_scale = false; + bool alloc_inp_KQ_mask = false; + bool alloc_inp_K_shift = false; - struct ggml_context * ctx0 = ggml_init(params); +#ifdef GGML_USE_CUBLAS + const bool do_offload = true; +#else + const bool do_offload = true; // TODO: set to false after finishing refactoring +#endif - ggml_cgraph * gf = ggml_new_graph(ctx0); + int n_non_view = 0; // number of non-view tensors that have been processed by the callback - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) + // TODO: will be removed with backend v2 + llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { + if (il >= 0) { + ggml_format_name(cur, "%s-%d", name, il); + } else { + ggml_set_name(cur, name); + } - if (tokens) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + // + // allocate input tensors and set input data + // + // TODO: will be removed with backend v2 - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); + if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif + if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) { + const int64_t n_tokens = cur->ne[0]; - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + memcpy(cur->data, batch.token, n_tokens*ggml_element_size(cur)); + } - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); + alloc_inp_tokens = true; } - } - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; + if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - // - // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal - // in that case ggml_cuda_assign_buffers has no effect - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; + if (!ggml_allocr_is_measure(lctx.alloc) && batch.embd) { + const int64_t n_embd = cur->ne[0]; + const int64_t n_tokens = cur->ne[1]; -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS + memcpy(cur->data, batch.embd, n_tokens*n_embd*ggml_element_size(cur)); + } - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + alloc_inp_embd = true; + } - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * attn_norm; + if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); - offload_func_t offload_func = llama_nop; + if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) { + const int64_t n_tokens = cur->ne[0]; -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS + int32_t * data = (int32_t *) cur->data; - // self-attention - // TODO: refactor into common function (shared with LLaMA) - { - attn_norm = ggml_norm(ctx0, inpL, norm_eps); - offload_func(attn_norm); - - attn_norm = ggml_add(ctx0, - ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm), - model.layers[il].attn_norm_b); - offload_func(attn_norm->src[0]); - offload_func(attn_norm); - - if (model.layers[il].attn_norm_2) { // Falcon-40B - cur = ggml_norm(ctx0, inpL, norm_eps); - offload_func(cur); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, cur, model.layers[il].attn_norm_2), - model.layers[il].attn_norm_2_b); - offload_func(cur->src[0]); - offload_func(cur); - } else { // Falcon 7B - cur = attn_norm; + for (int i = 0; i < n_tokens; ++i) { + data[i] = batch.pos[i]; + } } - // compute QKV - - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - offload_func_kq(cur); - - // Note that the strides for Kcur, Vcur are set up so that the - // resulting views are misaligned with the tensor's storage - // (by applying the K/V offset we shift the tensor's original - // view to stick out behind the viewed QKV tensor's allocated - // memory, so to say). This is ok because no actual accesses - // happen to that out-of-range memory, but it can require some - // trickery when trying to accurately dump these views for - // debugging. - - const size_t wsize = ggml_type_size(cur->type); - - // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for - // non-contiguous views is added for the rope operator - struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d( - ctx0, cur, n_embd_head, n_head, N, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - 0)); - offload_func_kq(tmpq); - - struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, N, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * n_head)); - offload_func_kq(tmpk); - - struct ggml_tensor * tmpv = ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, N, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * (n_head + n_head_kv)); - offload_func_v(tmpv); - - // using mode = 2 for neox mode - struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale); - offload_func_kq(Qcur); - struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale); - offload_func_kq(Kcur); + alloc_inp_pos = true; + } - { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N)); - offload_func_v(Vcur); - offload_func_v(Vcur->src[0]->src[0]); - ggml_set_name(Vcur, "Vcur"); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); - offload_func_kq(k); - ggml_set_name(k, "k"); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); - offload_func_v(v); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_embd_head = model.hparams.n_embd_head(); + ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); } - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); + alloc_inp_KQ_scale = true; + } + + if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_kv = cur->ne[0]; + const int64_t n_tokens = cur->ne[1]; - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_past + N, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); + float * data = (float *) cur->data; + memset(data, 0, ggml_nbytes(cur)); - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j][0]; - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); + for (int i = 0; i < n_kv; ++i) { + if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + } + } + } - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); + alloc_inp_KQ_mask = true; + } - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_past + N, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_ctx = cur->ne[0]; - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); + int32_t * data = (int32_t *) cur->data; - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); + for (int i = 0; i < n_ctx; ++i) { + data[i] = lctx.kv_self.cells[i].delta; + } + } - cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); + alloc_inp_K_shift = true; + } - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); - ggml_set_name(cur, "result_wo"); + // view tensors are not processed further + if (cur->view_src != nullptr) { + return; } - struct ggml_tensor * attn_out = cur; + if (cur->op != GGML_OP_NONE) { + n_non_view++; + } - // feed forward - { - struct ggml_tensor * inpFF = attn_norm; + // + // offload layers + // + // TODO: will be removed with backend v2 - cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF); - offload_func(cur); +//#define LLAMA_OFFLOAD_DEBUG - cur = ggml_gelu(ctx0, cur); - offload_func(cur); - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); + if (!do_offload) { + return; } - cur = ggml_add(ctx0, cur, attn_out); - offload_func(cur); - cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); + const int n_layer = model.hparams.n_layer; - // input for next layer - inpL = cur; - } + const int n_gpu_layers = model.n_gpu_layers; + const int i_gpu_start = n_layer - n_gpu_layers; - cur = inpL; + // should we offload the final norm? yes if we are not computing embeddings + const bool offload_emb = lctx.embedding.empty(); - // norm - { - cur = ggml_norm(ctx0, cur, norm_eps); - offload_func_nr(cur); + static const std::unordered_map> k_offload_func_name = { + { OFFLOAD_FUNC_NOP, "CPU" }, + { OFFLOAD_FUNC_OUT, "CPU" }, +#ifdef GGML_USE_CUBLAS + { OFFLOAD_FUNC, "GPU (CUDA)" }, + { OFFLOAD_FUNC_KQ, "GPU (CUDA) KQ" }, + { OFFLOAD_FUNC_V, "GPU (CUDA) V" }, + { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, + { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, +#else + { OFFLOAD_FUNC, "CPU" }, + { OFFLOAD_FUNC_KQ, "CPU" }, + { OFFLOAD_FUNC_V, "CPU" }, + { OFFLOAD_FUNC_NR, "CPU" }, + { OFFLOAD_FUNC_EMB, "CPU" }, +#endif // GGML_USE_CUBLAS + }; - cur = ggml_add(ctx0, - ggml_mul(ctx0, cur, model.output_norm), - model.output_norm_b); - ggml_set_name(cur, "result_norm"); - } + // check the global map for what offload function to use for this tensor + llm_offload_func_e func_e = k_offload_func_trie.find(name); + + if (func_e == OFFLOAD_FUNC_NOP) { +#ifdef LLAMA_OFFLOAD_DEBUG + // if a tensor hasn't been offloaded, we warn the user + if (worst_case) { + LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__, + cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837"); + } +#endif + + return; + } - cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + // count the number of layers and respect the provided n_gpu_layers + switch (func_e) { + case OFFLOAD_FUNC_NOP: + case OFFLOAD_FUNC_OUT: + break; + case OFFLOAD_FUNC: + if (n_gpu_layers < n_layer) { + if (il < i_gpu_start) { + func_e = OFFLOAD_FUNC_NOP; + } + } + break; + case OFFLOAD_FUNC_NR: + if (n_gpu_layers <= n_layer + 0) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_V: + if (n_gpu_layers <= n_layer + 1) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_KQ: + if (n_gpu_layers <= n_layer + 2) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_EMB: + if (!offload_emb || n_gpu_layers < n_layer) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + default: GGML_ASSERT(false); + } - ggml_build_forward_expand(gf, cur); + offload_func_t func = ggml_offload_nop; - ggml_free(ctx0); + // this is needed for compatibility with Metal for example +#ifdef GGML_USE_CUBLAS + static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc; +#else + static offload_func_t ggml_offload_gpu = ggml_offload_nop; +#endif - return gf; -} + switch (func_e) { + case OFFLOAD_FUNC_NOP: + case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break; + case OFFLOAD_FUNC: + case OFFLOAD_FUNC_KQ: + case OFFLOAD_FUNC_V: + case OFFLOAD_FUNC_NR: + case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break; + default: GGML_ASSERT(false); + } -static struct ggml_cgraph * llama_build_graph( - llama_context & lctx, - const llama_token * tokens, - const float * embd, - int n_tokens, - int n_past) { - const auto & model = lctx.model; + // apply offload function to the tensor + func(cur); + +#ifdef LLAMA_OFFLOAD_DEBUG + if (worst_case) { + LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str()); + } +#endif + }; struct ggml_cgraph * result = NULL; + struct llm_build_context llm(lctx, batch, cb, worst_case); + + llm.init(); + switch (model.arch) { case LLM_ARCH_LLAMA: { - result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past); + result = llm.build_llama(); } break; case LLM_ARCH_BAICHUAN: { - result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past); + result = llm.build_baichuan(); } break; case LLM_ARCH_FALCON: { - result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past); + result = llm.build_falcon(); + } break; + case LLM_ARCH_STARCODER: + { + result = llm.build_starcoder(); + } break; + case LLM_ARCH_PERSIMMON: + { + result = llm.build_persimmon(); + } break; + case LLM_ARCH_REFACT: + { + result = llm.build_refact(); + } break; + case LLM_ARCH_BLOOM: + { + result = llm.build_bloom(); + } break; + case LLM_ARCH_MPT: + { + result = llm.build_mpt(); } break; default: GGML_ASSERT(false); - }; + } + + llm.free(); + + if (worst_case) { + int n_non_view_total = 0; + + for (int i = 0; i < result->n_nodes; ++i) { + if (result->nodes[i]->view_src == nullptr) { + n_non_view_total++; + } + } + + LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total); + + if (n_non_view != n_non_view_total) { + LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); + LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__); + LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n", __func__); + LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__); + LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__); + LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); + } + } return result; } -// evaluate the transformer +// decode a batch of tokens by evaluating the transformer // // - lctx: llama context -// - tokens: new batch of tokens to process -// - embd embeddings input -// - n_tokens number of tokens -// - n_past: the context size so far -// - n_threads: number of threads to use +// - batch: batch to evaluate // -static bool llama_eval_internal( +// return 0 on success +// return positive int on warning +// return negative int on error +// +static int llama_decode_internal( llama_context & lctx, - const llama_token * tokens, - const float * embd, - int n_tokens, - int n_past, - int n_threads, - const char * cgraph_fname) { + llama_batch batch) { + const uint32_t n_tokens = batch.n_tokens; + + if (n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__); + return -1; + } + + const auto & model = lctx.model; + const auto & hparams = model.hparams; + const auto & cparams = lctx.cparams; - GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT + const auto n_batch = cparams.n_batch; - GGML_ASSERT(n_tokens > 0); - GGML_ASSERT(n_past >= 0); - // TODO: keep the values of n_batch and n_ctx - // GGML_ASSERT(n_tokens <= n_batch); - // GGML_ASSERT(n_past + n_tokens <= n_ctx); + GGML_ASSERT(n_tokens <= n_batch); + + int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT const int64_t t_start_us = ggml_time_us(); #ifdef GGML_USE_MPI - ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); + // TODO: needs fix after #3228 + GGML_ASSERT(false && "not implemented"); + //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); #endif GGML_ASSERT(n_threads > 0); - const int N = n_tokens; - - const auto & model = lctx.model; - const auto & hparams = model.hparams; - - const auto & kv_self = lctx.kv_self; + auto & kv_self = lctx.kv_self; GGML_ASSERT(!!kv_self.ctx); const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = hparams.n_vocab; + // helpers for smoother batch API transistion + // after deprecating the llama_eval calls, these will be removed + std::vector pos; + + std::vector n_seq_id; + std::vector seq_id_arr; + std::vector> seq_id; + + if (batch.pos == nullptr) { + pos.resize(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + pos[i] = batch.all_pos_0 + i*batch.all_pos_1; + } + + batch.pos = pos.data(); + } + + if (batch.seq_id == nullptr) { + n_seq_id.resize(n_tokens); + seq_id.resize(n_tokens); + seq_id_arr.resize(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + n_seq_id[i] = 1; + seq_id[i].resize(1); + seq_id[i][0] = batch.all_seq_id; + seq_id_arr[i] = seq_id[i].data(); + } + + batch.n_seq_id = n_seq_id.data(); + batch.seq_id = seq_id_arr.data(); + } + + if (!llama_kv_cache_find_slot(kv_self, batch)) { + return 1; + } + + // a heuristic, to avoid attending the full cache if it is not yet utilized + // after enough generations, the benefit from this heuristic disappears + // if we start defragmenting the cache, the benefit from this will be more important + //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA? + kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self))); + + //printf("kv_self.n = %d\n", kv_self.n); + ggml_allocr_reset(lctx.alloc); - ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past); + ggml_cgraph * gf = llama_build_graph(lctx, batch); ggml_allocr_alloc_graph(lctx.alloc, gf); + struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; + + GGML_ASSERT(strcmp(res->name, "result_output") == 0); + GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); + + #ifdef GGML_USE_CUBLAS for (int i = 0; i < gf->n_leafs; i++) { ggml_tensor * node = gf->leafs[i]; if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data); + ggml_cuda_copy_to_device(node); } } @@ -3401,6 +5143,12 @@ static bool llama_eval_internal( ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data); } } + + // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed + if (!lctx.embedding.empty()) { + embeddings->backend = GGML_BACKEND_CPU; + } + res->backend = GGML_BACKEND_CPU; #endif // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -3410,15 +5158,22 @@ static bool llama_eval_internal( // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering // with the BLAS calls. need a better solution - if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { + if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { n_threads = std::min(4, n_threads); } - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; + // If all tensors can be run on the GPU then using more than 1 thread is detrimental. + const bool full_offload_supported = + model.arch == LLM_ARCH_LLAMA || + model.arch == LLM_ARCH_BAICHUAN || + model.arch == LLM_ARCH_FALCON || + model.arch == LLM_ARCH_REFACT || + model.arch == LLM_ARCH_MPT; - GGML_ASSERT(strcmp(res->name, "result_output") == 0); - GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); + const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3; + if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) { + n_threads = 1; + } #if GGML_USE_MPI const int64_t n_layer = hparams.n_layer; @@ -3440,11 +5195,21 @@ static bool llama_eval_internal( ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); #endif - // update kv token count - lctx.kv_self.n = n_past + N; + // update the kv ring buffer + { + if (kv_self.has_shift) { + kv_self.has_shift = false; + for (uint32_t i = 0; i < kv_self.size; ++i) { + kv_self.cells[i].delta = 0; + } + } + + kv_self.head += n_tokens; - if (cgraph_fname) { - ggml_graph_export(gf, cgraph_fname); + // Ensure kv cache head points to a valid index. + if (kv_self.head >= kv_self.size) { + kv_self.head = 0; + } } #ifdef GGML_PERF @@ -3459,16 +5224,25 @@ static bool llama_eval_internal( //} // extract logits + // TODO: do not compute and extract logits if only embeddings are needed + // need to update the graphs to skip "result_output" { auto & logits_out = lctx.logits; - if (lctx.logits_all) { - logits_out.resize(n_vocab * N); - memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N); + if (batch.logits) { + logits_out.resize(n_vocab * n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + if (batch.logits[i] == 0) { + continue; + } + memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab); + } + } else if (lctx.logits_all) { + logits_out.resize(n_vocab * n_tokens); + memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens); } else { - // return result for just the last token logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); } } @@ -3477,20 +5251,27 @@ static bool llama_eval_internal( auto & embedding_out = lctx.embedding; embedding_out.resize(n_embd); - memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); + memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd); } // measure the performance only for the single-token evals - if (N == 1) { + if (n_tokens == 1) { lctx.t_eval_us += ggml_time_us() - t_start_us; lctx.n_eval++; } - else if (N > 1) { + else if (n_tokens > 1) { lctx.t_p_eval_us += ggml_time_us() - t_start_us; - lctx.n_p_eval += N; + lctx.n_p_eval += n_tokens; } - return true; + // get a more accurate load time, upon first eval + // TODO: fix this + if (!lctx.has_evaluated_once) { + lctx.t_load_us = ggml_time_us() - lctx.t_start_us; + lctx.has_evaluated_once = true; + } + + return 0; } // @@ -3517,18 +5298,40 @@ static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) { return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE; } -static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) { +static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) { + return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED; +} + +static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { GGML_ASSERT(llama_is_byte_token(vocab, id)); const auto& token_data = vocab.id_to_token.at(id); - auto buf = token_data.text.substr(3, 2); - return strtol(buf.c_str(), NULL, 16); + switch (llama_vocab_get_type(vocab)) { + case LLAMA_VOCAB_TYPE_SPM: { + auto buf = token_data.text.substr(3, 2); + return strtol(buf.c_str(), NULL, 16); + } + case LLAMA_VOCAB_TYPE_BPE: { + GGML_ASSERT(false); + return unicode_to_bytes_bpe(token_data.text); + } + default: + GGML_ASSERT(false); + } } static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { - char buf[7]; - int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch); - GGML_ASSERT(0 <= result && result < 7); - return vocab.token_to_id.at(buf); + static const char * hex = "0123456789ABCDEF"; + switch (llama_vocab_get_type(vocab)) { + case LLAMA_VOCAB_TYPE_SPM: { + const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 }; + return vocab.token_to_id.at(buf); + } + case LLAMA_VOCAB_TYPE_BPE: { + return vocab.token_to_id.at(bytes_to_unicode_bpe(ch)); + } + default: + GGML_ASSERT(false); + } } static void llama_escape_whitespace(std::string & text) { @@ -3734,7 +5537,6 @@ struct llm_tokenizer_bpe { llm_symbol sym; size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset])); sym.text = word.c_str() + offset; - sym.n = 1; sym.n = char_len; offset += sym.n; sym.prev = index - 1; @@ -3808,15 +5610,9 @@ struct llm_tokenizer_bpe { std::string byte_str(1, *j); auto token_multibyte = vocab.token_to_id.find(byte_str); if (token_multibyte == vocab.token_to_id.end()) { - try { - llama_token token_byte = llama_byte_to_token(vocab, *j); - output.push_back(token_byte); - } catch (const std::out_of_range & err) { - fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str()); - } - } else { - output.push_back((*token_multibyte).second); + throw std::runtime_error("ERROR: byte not found in vocab"); } + output.push_back((*token_multibyte).second); } } else { output.push_back((*token).second); @@ -3853,23 +5649,143 @@ private: work_queue.push(bigram); } - // probably not 100% correct - static std::vector bpe_gpt2_preprocess(const std::string & text) { - std::vector words; + std::vector bpe_gpt2_preprocess(const std::string & text) { + std::vector bpe_words; + std::vector bpe_encoded_words; + + std::string token = ""; + // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+ + bool collecting_numeric = false; + bool collecting_letter = false; + bool collecting_special = false; + bool collecting_whitespace_lookahead = false; + bool collecting = false; + + std::vector text_utf; + text_utf.reserve(text.size()); + bpe_words.reserve(text.size()); + bpe_encoded_words.reserve(text.size()); + + auto cps = codepoints_from_utf8(text); + for (size_t i = 0; i < cps.size(); ++i) + text_utf.emplace_back(codepoint_to_utf8(cps[i])); + + for (int i = 0; i < (int)text_utf.size(); i++) { + const std::string & utf_char = text_utf[i]; + bool split_condition = false; + int bytes_remain = text_utf.size() - i; + // forward backward lookups + const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : ""; + const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : ""; + + // handling contractions + if (!split_condition && bytes_remain >= 2) { + // 's|'t|'m|'d + if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) { + split_condition = true; + } + if (split_condition) { + if (token.size()) { + bpe_words.emplace_back(token); // push previous content as token + } + token = utf_char + utf_char_next; + bpe_words.emplace_back(token); + token = ""; + i++; + continue; + } + } + if (!split_condition && bytes_remain >= 3) { + // 're|'ve|'ll + if (utf_char == "\'" && ( + (utf_char_next == "r" && utf_char_next_next == "e") || + (utf_char_next == "v" && utf_char_next_next == "e") || + (utf_char_next == "l" && utf_char_next_next == "l")) + ) { + split_condition = true; + } + if (split_condition) { + // current token + next token can be defined + if (token.size()) { + bpe_words.emplace_back(token); // push previous content as token + } + token = utf_char + utf_char_next + utf_char_next_next; + bpe_words.emplace_back(token); // the contraction + token = ""; + i += 2; + continue; + } + } + + if (!split_condition && !collecting) { + if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) { + collecting_letter = true; + collecting = true; + } + else if (codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { + collecting_numeric = true; + collecting = true; + } + else if ( + ((codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (codepoint_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) || + (!token.size() && utf_char == " " && codepoint_type(utf_char_next) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) + ) { + collecting_special = true; + collecting = true; + } + else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) { + collecting_whitespace_lookahead = true; + collecting = true; + } + else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) { + split_condition = true; + } + } + else if (!split_condition && collecting) { + if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) { + split_condition = true; + } + else if (collecting_numeric && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) { + split_condition = true; + } + else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) { + split_condition = true; + } + else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { + split_condition = true; + } + } + + if (utf_char_next == "") { + split_condition = true; // final + token += utf_char; + } - // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53 - const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"; - const std::regex re(pattern); + if (split_condition) { + if (token.size()) { + bpe_words.emplace_back(token); + } + token = utf_char; + collecting = false; + collecting_letter = false; + collecting_numeric = false; + collecting_special = false; + collecting_whitespace_lookahead = false; + } + else { + token += utf_char; + } + } - auto words_begin = std::sregex_iterator(text.begin(), text.end(), re); - auto words_end = std::sregex_iterator(); - auto n_words = std::distance(words_begin, words_end); - words.reserve(n_words); - for (auto it = words_begin; it != words_end; ++it) { - words.push_back(it->str()); + for (std::string & word : bpe_words) { + std::string encoded_token = ""; + for (char & c : word) { + encoded_token += bytes_to_unicode_bpe(c); + } + bpe_encoded_words.emplace_back(encoded_token); } - return words; + return bpe_encoded_words; } const llama_vocab & vocab; @@ -3880,7 +5796,137 @@ private: llm_bigram_bpe::queue work_queue; }; -static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) { +typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{ + FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN, + FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT +} FRAGMENT_BUFFER_VARIANT_TYPE; + +struct fragment_buffer_variant{ + fragment_buffer_variant(llama_vocab::id _token) + : + type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN), + token(_token), + raw_text(_dummy), + offset(0), + length(0){} + fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length) + : + type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT), + token((llama_vocab::id)-1), + raw_text(_raw_text), + offset(_offset), + length(_length){ + GGML_ASSERT( _offset >= 0 ); + GGML_ASSERT( _length >= 1 ); + GGML_ASSERT( offset + length <= raw_text.length() ); + } + + const FRAGMENT_BUFFER_VARIANT_TYPE type; + const llama_vocab::id token; + const std::string _dummy; + const std::string & raw_text; + const uint64_t offset; + const uint64_t length; +}; + +// #define PRETOKENIZERDEBUG + +static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list & buffer) +{ + // for each special token + for (const auto & st: vocab.special_tokens_cache) { + const auto & special_token = st.first; + const auto & special_id = st.second; + + // for each text fragment + std::forward_list::iterator it = buffer.begin(); + while (it != buffer.end()) { + auto & fragment = (*it); + + // if a fragment is text ( not yet processed ) + if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { + auto * raw_text = &(fragment.raw_text); + + auto raw_text_base_offset = fragment.offset; + auto raw_text_base_length = fragment.length; + + // loop over the text + while (true) { + // find the first occurence of a given special token in this fragment + // passing offset argument only limit the "search area" but match coordinates + // are still relative to the source full raw_text + auto match = raw_text->find(special_token, raw_text_base_offset); + + // no occurences found, stop processing this fragment for a given special token + if (match == std::string::npos) break; + + // check if match is within bounds of offset <-> length + if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break; + +#ifdef PRETOKENIZERDEBUG + fprintf(stderr, "FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str()); +#endif + auto source = std::distance(buffer.begin(), it); + + // if match is further than base offset + // then we have some text to the left of it + if (match > raw_text_base_offset) { + // left + const int64_t left_reminder_offset = raw_text_base_offset + 0; + const int64_t left_reminder_length = match - raw_text_base_offset; + buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length); + +#ifdef PRETOKENIZERDEBUG + fprintf(stderr, "FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str()); +#endif + it++; + } + + // special token + buffer.emplace_after(it, special_id); + it++; + + // right + if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) { + const int64_t right_reminder_offset = match + special_token.length(); + const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length()); + buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length); + +#ifdef PRETOKENIZERDEBUG + fprintf(stderr, "FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str()); +#endif + + it++; + + if (source == 0) { + buffer.erase_after(buffer.before_begin()); + } else { + buffer.erase_after(std::next(buffer.begin(), (source-1))); + } + + // repeat for the right side + raw_text_base_offset = right_reminder_offset; + raw_text_base_length = right_reminder_length; + +#ifdef PRETOKENIZERDEBUG + fprintf(stderr, "RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str()); +#endif + } else { + if (source == 0) { + buffer.erase_after(buffer.before_begin()); + } else { + buffer.erase_after(std::next(buffer.begin(), (source-1))); + } + break; + } + } + } + it++; + } + } +} + +static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) { std::vector output; // OG tokenizer behavior: @@ -3896,22 +5942,60 @@ static std::vector llama_tokenize_internal(const llama_vocab & return output; } + std::forward_list fragment_buffer; + fragment_buffer.emplace_front( raw_text, 0, raw_text.length() ); + + if (special) tokenizer_st_partition( vocab, fragment_buffer ); + switch (vocab.type) { case LLAMA_VOCAB_TYPE_SPM: { - // without adding this leading whitespace, we do not get the same results as the original tokenizer - raw_text = " " + raw_text; + for (const auto & fragment: fragment_buffer) + { + if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) + { + // without adding this leading whitespace, we do not get the same results as the original tokenizer + + // TODO: It's likely possible to get rid of this string copy entirely + // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer + // and passing 'add space prefix' as bool argument + // + auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length); - llm_tokenizer_spm tokenizer(vocab); - llama_escape_whitespace(raw_text); - tokenizer.tokenize(raw_text, output); +#ifdef PRETOKENIZERDEBUG + fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); +#endif + llm_tokenizer_spm tokenizer(vocab); + llama_escape_whitespace(raw_text); + tokenizer.tokenize(raw_text, output); + } + else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) + { + output.push_back(fragment.token); + } + } } break; case LLAMA_VOCAB_TYPE_BPE: { - llm_tokenizer_bpe tokenizer(vocab); - tokenizer.tokenize(raw_text, output); + for (const auto & fragment: fragment_buffer) + { + if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) + { + auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); + +#ifdef PRETOKENIZERDEBUG + fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); +#endif + llm_tokenizer_bpe tokenizer(vocab); + tokenizer.tokenize(raw_text, output); + } + else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) + { + output.push_back(fragment.token); + } + } } break; - }; + } return output; } @@ -3941,7 +6025,7 @@ struct llama_grammar_candidate { // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`. -std::pair, llama_partial_utf8> decode_utf8( +static std::pair, llama_partial_utf8> decode_utf8( const char * src, llama_partial_utf8 partial_start) { static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 }; @@ -4182,7 +6266,7 @@ static std::vector llama_grammar_reject_candidates_for_ std::vector rejects; if (stack.empty()) { - for (auto tok : candidates) { + for (const auto & tok : candidates) { if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) { rejects.push_back(tok); } @@ -4193,7 +6277,7 @@ static std::vector llama_grammar_reject_candidates_for_ const llama_grammar_element * stack_pos = stack.back(); std::vector next_candidates; - for (auto tok : candidates) { + for (const auto & tok : candidates) { if (*tok.code_points == 0) { // reached end of full codepoints in token, reject iff it ended in a partial sequence // that cannot satisfy this position in grammar @@ -4219,7 +6303,7 @@ static std::vector llama_grammar_reject_candidates_for_ llama_grammar_advance_stack(rules, stack_after, next_stacks); auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates); - for (auto tok : next_rejects) { + for (const auto & tok : next_rejects) { rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 }); } @@ -4315,6 +6399,13 @@ struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) // sampling // +void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) { + if (seed == LLAMA_DEFAULT_SEED) { + seed = time(NULL); + } + ctx->rng.seed(seed); +} + void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) { GGML_ASSERT(candidates->size > 0); @@ -4401,6 +6492,32 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can } } +void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { + if (p <= 0.0f || !candidates->size) { + return; + } + + llama_sample_softmax(ctx, candidates); + + const int64_t t_start_sample_us = ggml_time_us(); + + float scale = candidates->data[0].p; // scale by max prob + size_t i = 1; // first token always matches + + for (; i < candidates->size; ++i) { + if (candidates->data[i].p < p * scale && i >= min_keep) { + break; // prob too small + } + } + + // Resize the output vector to keep only the matching tokens + candidates->size = i; + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) { if (z >= 1.0f || candidates->size <= 2) { return; @@ -4523,7 +6640,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c } } -void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { +void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { const int64_t t_start_sample_us = ggml_time_us(); for (size_t i = 0; i < candidates_p->size; ++i) { @@ -4535,37 +6652,19 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array } } -void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) { - if (last_tokens_size == 0 || penalty == 1.0f) { - return; - } - - const int64_t t_start_sample_us = ggml_time_us(); - - for (size_t i = 0; i < candidates->size; ++i) { - const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id); - if (token_iter == last_tokens + last_tokens_size) { - continue; - } - - // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong. - // This is common fix for this problem, which is to multiply by the penalty instead of dividing. - if (candidates->data[i].logit <= 0) { - candidates->data[i].logit *= penalty; - } else { - candidates->data[i].logit /= penalty; - } - } - - candidates->sorted = false; - - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } +void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { + llama_sample_temp(ctx, candidates_p, temp); } -void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) { - if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) { +void llama_sample_repetition_penalties( + struct llama_context * ctx, + llama_token_data_array * candidates, + const llama_token * last_tokens, + size_t penalty_last_n, + float penalty_repeat, + float penalty_freq, + float penalty_present) { + if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) { return; } @@ -4573,19 +6672,28 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l // Create a frequency map to count occurrences of each token in last_tokens std::unordered_map token_count; - for (size_t i = 0; i < last_tokens_size; ++i) { - token_count[last_tokens_p[i]]++; + for (size_t i = 0; i < penalty_last_n; ++i) { + token_count[last_tokens[i]]++; } // Apply frequency and presence penalties to the candidates for (size_t i = 0; i < candidates->size; ++i) { - auto token_iter = token_count.find(candidates->data[i].id); + const auto token_iter = token_count.find(candidates->data[i].id); if (token_iter == token_count.end()) { continue; } - int count = token_iter->second; - candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence; + const int count = token_iter->second; + + // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong. + // This is common fix for this problem, which is to multiply by the penalty instead of dividing. + if (candidates->data[i].logit <= 0) { + candidates->data[i].logit *= penalty_repeat; + } else { + candidates->data[i].logit /= penalty_repeat; + } + + candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present; } candidates->sorted = false; @@ -4607,14 +6715,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c } } - const llama_token eos = llama_token_eos(ctx); + const llama_token eos = llama_token_eos(&ctx->model); std::vector, llama_partial_utf8>> candidates_decoded; std::vector candidates_grammar; for (size_t i = 0; i < candidates->size; ++i) { const llama_token id = candidates->data[i].id; - const std::string piece = llama_token_to_str(ctx, id); + const std::string piece = llama_token_to_piece(ctx, id); if (id == eos) { if (!allow_eos) { candidates->data[i].logit = -INFINITY; @@ -4658,7 +6766,7 @@ void llama_sample_classifier_free_guidance( GGML_ASSERT(ctx); - auto n_vocab = llama_n_vocab(ctx); + auto n_vocab = llama_n_vocab(llama_get_model(ctx)); GGML_ASSERT(n_vocab == (int)candidates->size); GGML_ASSERT(!candidates->sorted); @@ -4687,7 +6795,7 @@ void llama_sample_classifier_free_guidance( llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) { GGML_ASSERT(ctx); - auto N = float(llama_n_vocab(ctx)); + auto N = float(llama_n_vocab(llama_get_model(ctx))); int64_t t_start_sample_us; t_start_sample_us = ggml_time_us(); @@ -4817,7 +6925,7 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) { const int64_t t_start_sample_us = ggml_time_us(); - if (token == llama_token_eos(ctx)) { + if (token == llama_token_eos(&ctx->model)) { for (const auto & stack : grammar->stacks) { if (stack.empty()) { return; @@ -4826,7 +6934,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar GGML_ASSERT(false); } - const std::string piece = llama_token_to_str(ctx, token); + const std::string piece = llama_token_to_piece(ctx, token); // Note terminating 0 in decoded string const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8); @@ -4874,7 +6982,7 @@ struct llama_logit_info { }; llama_logit_info(llama_context * ctx) : logits(llama_get_logits(ctx)) - , n_vocab(llama_n_vocab(ctx)) + , n_vocab(llama_n_vocab(llama_get_model(ctx))) , max_l(*std::max_element(logits, logits + n_vocab)) , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l})) { } @@ -4912,7 +7020,6 @@ struct llama_beam_search_data { size_t n_beams; int n_past; int n_predict; - int n_threads; std::vector beams; std::vector next_beams; @@ -4922,12 +7029,11 @@ struct llama_beam_search_data { // Used to communicate to/from callback on beams state. std::vector beam_views; - llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads) + llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict) : ctx(ctx) , n_beams(n_beams) , n_past(n_past) , n_predict(n_predict) - , n_threads(n_threads) , beam_views(n_beams) { beams.reserve(n_beams); next_beams.reserve(n_beams); @@ -4964,7 +7070,7 @@ struct llama_beam_search_data { } else { // beam is not at end-of-sentence, so branch with next top_k tokens. if (!beam.tokens.empty()) { - llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads); + llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0)); } llama_logit_info logit_info(ctx); std::vector next_tokens = logit_info.top_k(n_beams); @@ -5038,7 +7144,7 @@ struct llama_beam_search_data { callback(callback_data, get_beams_state(false)); // Sets common_prefix_length update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed. if (common_prefix_length) { - llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads); + llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0)); n_past += common_prefix_length; } // Zero-out next_beam probabilities to place them last in following min-heap. @@ -5079,11 +7185,11 @@ struct llama_beam_search_data { void llama_beam_search(llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, - size_t n_beams, int n_past, int n_predict, int n_threads) { + size_t n_beams, int n_past, int n_predict) { assert(ctx); const int64_t t_start_sample_us = ggml_time_us(); - llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads); + llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict); beam_search_data.loop(callback, callback_data); @@ -5101,6 +7207,24 @@ struct no_init { no_init() { /* do nothing */ } }; +struct quantize_state_internal { + const llama_model & model; + const llama_model_quantize_params * params; + + int n_attention_wv = 0; + int n_feed_forward_w2 = 0; + int i_attention_wv = 0; + int i_feed_forward_w2 = 0; + + int n_k_quantized = 0; + int n_fallback = 0; + + quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params) + : model(model) + , params(params) + {} +}; + static void llama_convert_tensor_internal( struct ggml_tensor * tensor, std::vector> & output, std::vector & workers, const size_t nelements, const int nthread @@ -5159,14 +7283,14 @@ static void llama_convert_tensor_internal( workers.clear(); } -#ifdef GGML_USE_K_QUANTS static ggml_type get_k_quant_type( - ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv, - int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2 + quantize_state_internal & qs, + ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype ) { const std::string name = ggml_get_name(tensor); // TODO: avoid hardcoded tensor names - use the TN_* constants - const auto tn = LLM_TN(model.arch); + const llm_arch arch = qs.model.arch; + const auto tn = LLM_TN(arch); auto use_more_bits = [](int i_layer, int num_layers) -> bool { return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2; @@ -5174,7 +7298,7 @@ static ggml_type get_k_quant_type( if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { int nx = tensor->ne[0]; - if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) { + if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { new_type = GGML_TYPE_Q8_0; } else if (new_type != GGML_TYPE_Q8_0) { @@ -5183,46 +7307,46 @@ static ggml_type get_k_quant_type( } else if (name.find("attn_v.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && - use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; + use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && - (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; - if (model.type == MODEL_70B) { + (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; + if (qs.model.type == MODEL_70B) { // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with // nearly negligible increase in model size by quantizing this tensor with more bits: if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K; } - ++*i_attention_wv; + ++qs.i_attention_wv; } else if (name.find("ffn_down.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K - : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K + new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K + : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { - new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; + new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { - if (model.arch == LLM_ARCH_FALCON) { - new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K : - use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + if (arch == LLM_ARCH_FALCON) { + new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K : + use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else { - if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; + if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; } } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) { new_type = GGML_TYPE_Q5_K; } - ++*i_feed_forward_w2; + ++qs.i_feed_forward_w2; } else if (name.find("attn_output.weight") != std::string::npos) { - if (model.arch != LLM_ARCH_FALCON) { + if (arch != LLM_ARCH_FALCON) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; @@ -5249,25 +7373,27 @@ static ggml_type get_k_quant_type( int nx = tensor->ne[0]; int ny = tensor->ne[1]; if (nx % QK_K != 0) { - LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K); + LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type)); convert_incompatible_tensor = true; + } else { + ++qs.n_k_quantized; } } if (convert_incompatible_tensor) { - if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { - new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing. - LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n"); - } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) { - new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing. - LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n"); - } else { - throw std::runtime_error("Unsupported tensor size encountered\n"); + switch (new_type) { + case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break; + case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break; + case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; + case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break; + case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; + default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); } + LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); + ++qs.n_fallback; } return new_type; } -#endif static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { ggml_type quantized_type; @@ -5282,7 +7408,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break; case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break; -#ifdef GGML_USE_K_QUANTS // K-quants case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break; case LLAMA_FTYPE_MOSTLY_Q3_K_S: @@ -5293,7 +7418,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q5_K_S: case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break; case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break; -#endif + default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -5303,11 +7428,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s nthread = std::thread::hardware_concurrency(); } - std::unique_ptr ml(new llama_model_loader(fname_inp, /*use_mmap*/ false)); + // mmap consistently increases speed Linux, and also increases speed on Windows with + // hot cache. It may cause a slowdown on macOS, possibly related to free memory. +#if defined(__linux__) || defined(_WIN32) + constexpr bool use_mmap = true; +#else + constexpr bool use_mmap = false; +#endif + + llama_model_loader ml(fname_inp, use_mmap); + if (ml.use_mmap) { + ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa())); + } llama_model model; - llm_load_arch(*ml, model); - llm_load_hparams(*ml, model, 0, 0, 0); + llm_load_arch(ml, model); + llm_load_hparams(ml, model); + + struct quantize_state_internal qs(model, params); if (params->only_copy) { ftype = model.ftype; @@ -5317,36 +7455,28 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s struct gguf_context * ctx_out = gguf_init_empty(); // copy the KV pairs from the input file - gguf_set_kv (ctx_out, ml->ctx_gguf); + gguf_set_kv (ctx_out, ml.ctx_gguf); gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); gguf_set_val_u32(ctx_out, "general.file_type", ftype); -#ifdef GGML_USE_K_QUANTS - int n_attention_wv = 0; - int n_feed_forward_w2 = 0; - - for (int i = 0; i < ml->n_tensors; ++i) { - struct ggml_tensor * meta = ml->get_tensor_meta(i); + for (int i = 0; i < ml.n_tensors; ++i) { + struct ggml_tensor * meta = ml.get_tensor_meta(i); const std::string name = ggml_get_name(meta); // TODO: avoid hardcoded tensor names - use the TN_* constants - if (name.find("attn_v.weight") != std::string::npos) { - ++n_attention_wv; + if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) { + ++qs.n_attention_wv; } else if (name.find("ffn_down.weight") != std::string::npos) { - ++n_feed_forward_w2; + ++qs.n_feed_forward_w2; } } - if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) { + if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) { LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n", - __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer); + __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer); } - int i_attention_wv = 0; - int i_feed_forward_w2 = 0; -#endif - size_t total_size_org = 0; size_t total_size_new = 0; std::vector hist_all(1 << 4, 0); @@ -5362,12 +7492,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s std::vector> f32_conv_buf; // populate the original tensors so we get an initial meta data - for (int i = 0; i < ml->n_tensors; ++i) { - struct ggml_tensor * meta = ml->get_tensor_meta(i); + for (int i = 0; i < ml.n_tensors; ++i) { + struct ggml_tensor * meta = ml.get_tensor_meta(i); gguf_add_tensor(ctx_out, meta); } std::ofstream fout(fname_out, std::ios::binary); + fout.exceptions(std::ofstream::failbit); // fail fast on write errors const size_t meta_size = gguf_get_meta_size(ctx_out); @@ -5376,19 +7507,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // placeholder for the meta data ::zeros(fout, meta_size); - for (int i = 0; i < ml->n_tensors; ++i) { - struct ggml_tensor * tensor = ml->get_tensor_meta(i); + for (int i = 0; i < ml.n_tensors; ++i) { + struct ggml_tensor * tensor = ml.get_tensor_meta(i); const std::string name = ggml_get_name(tensor); - if (read_data.size() < ggml_nbytes(tensor)) { - read_data.resize(ggml_nbytes(tensor)); + if (!ml.use_mmap) { + if (read_data.size() < ggml_nbytes(tensor)) { + read_data.resize(ggml_nbytes(tensor)); + } + tensor->data = read_data.data(); } - tensor->data = read_data.data(); - ml->load_data_for(tensor); + ml.load_data_for(tensor); LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", - ++idx, ml->n_tensors, + ++idx, ml.n_tensors, ggml_get_name(tensor), llama_format_tensor_shape(tensor).c_str(), ggml_type_name(tensor->type)); @@ -5407,11 +7540,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (quantize) { new_type = quantized_type; -#ifdef GGML_USE_K_QUANTS - new_type = get_k_quant_type( - new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2 - ); -#endif + if (!params->pure) { + new_type = get_k_quant_type(qs, new_type, tensor, ftype); + } + // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. quantize = tensor->type != new_type; @@ -5536,10 +7668,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s LLAMA_LOG_INFO("\n"); } } + + if (qs.n_fallback > 0) { + LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n", + __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback); + } } -// TODO: after the GGUF PR, this likely won't work and needs to be updated -int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) { +static int llama_apply_lora_from_file_internal( + const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads +) { LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); const int64_t t_start_lora_us = ggml_time_us(); @@ -5567,7 +7705,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const int32_t lora_alpha; fin.read((char *) &lora_r, sizeof(lora_r)); fin.read((char *) &lora_alpha, sizeof(lora_alpha)); - float scaling = (float)lora_alpha / (float)lora_r; + float scaling = scale * (float)lora_alpha / (float)lora_r; LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); @@ -5699,14 +7837,14 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const ggml_tensor * dest_t = model_tensors[base_name]; - offload_func_t offload_func = llama_nop; - offload_func_t offload_func_force_inplace = llama_nop; + offload_func_t offload_func = ggml_offload_nop; + offload_func_t offload_func_force_inplace = ggml_offload_nop; #ifdef GGML_USE_CUBLAS if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) { if (dest_t->type != GGML_TYPE_F16) { throw std::runtime_error(format( - "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__)); + "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models. dest_t->type: %d", __func__, dest_t->type)); } offload_func = ggml_cuda_assign_buffers; offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace; @@ -5783,9 +7921,10 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const ggml_set_name(r, "r_cpy"); } - struct ggml_cgraph gf = ggml_build_forward(r); + struct ggml_cgraph * gf = ggml_new_graph(lora_ctx); + ggml_build_forward_expand(gf, r); - ggml_graph_compute_helper(work_buffer, &gf, n_threads); + ggml_graph_compute_helper(work_buffer, gf, n_threads); // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); @@ -5814,27 +7953,16 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const // // interface implementation // - -struct llama_context_params llama_context_default_params() { - struct llama_context_params result = { - /*.seed =*/ LLAMA_DEFAULT_SEED, - /*.n_ctx =*/ 512, - /*.n_batch =*/ 512, +struct llama_model_params llama_model_default_params() { + struct llama_model_params result = { /*.n_gpu_layers =*/ 0, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, - /*.rope_freq_base =*/ 10000.0f, - /*.rope_freq_scale =*/ 1.0f, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, - /*.low_vram =*/ false, - /*.mul_mat_q =*/ true, - /*.f16_kv =*/ true, - /*.logits_all =*/ false, /*.vocab_only =*/ false, /*.use_mmap =*/ true, /*.use_mlock =*/ false, - /*.embedding =*/ false, }; #ifdef GGML_USE_METAL @@ -5844,6 +7972,30 @@ struct llama_context_params llama_context_default_params() { return result; } +struct llama_context_params llama_context_default_params() { + struct llama_context_params result = { + /*.seed =*/ LLAMA_DEFAULT_SEED, + /*.n_ctx =*/ 512, + /*.n_batch =*/ 512, + /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default + /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, + /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, + /*.rope_freq_base =*/ 0.0f, + /*.rope_freq_scale =*/ 0.0f, + /*.yarn_ext_factor =*/ -1.0f, + /*.yarn_attn_factor =*/ 1.0f, + /*.yarn_beta_fast =*/ 32.0f, + /*.yarn_beta_slow =*/ 1.0f, + /*.yarn_orig_ctx =*/ 0, + /*.mul_mat_q =*/ true, + /*.f16_kv =*/ true, + /*.logits_all =*/ false, + /*.embedding =*/ false, + }; + + return result; +} + struct llama_model_quantize_params llama_model_quantize_default_params() { struct llama_model_quantize_params result = { /*.nthread =*/ 0, @@ -5851,6 +8003,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.allow_requantize =*/ false, /*.quantize_output_tensor =*/ true, /*.only_copy =*/ false, + /*.pure =*/ false, }; return result; @@ -5899,13 +8052,11 @@ int64_t llama_time_us(void) { struct llama_model * llama_load_model_from_file( const char * path_model, - struct llama_context_params params) { + struct llama_model_params params) { ggml_time_init(); llama_model * model = new llama_model; - ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - unsigned cur_percentage = 0; if (params.progress_callback == NULL) { params.progress_callback_user_data = &cur_percentage; @@ -5922,10 +8073,7 @@ struct llama_model * llama_load_model_from_file( }; } - if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers, - params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale, - params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, - params.progress_callback, params.progress_callback_user_data)) { + if (!llama_model_load(path_model, *model, params)) { LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); delete model; return nullptr; @@ -5948,18 +8096,55 @@ struct llama_context * llama_new_context_with_model( llama_context * ctx = new llama_context(*model); + const auto & hparams = model->hparams; + auto & cparams = ctx->cparams; + + cparams.n_batch = params.n_batch; + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch; + cparams.yarn_ext_factor = params.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.mul_mat_q = params.mul_mat_q; + + cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; + cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; + + cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : + hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx : + hparams.n_ctx_train; + + auto rope_scaling_type = params.rope_scaling_type; + if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { + rope_scaling_type = hparams.rope_scaling_type_train; + } + + if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) { + cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none + } + + if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f; + } + if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); } + LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); + LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); + LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); + ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; // reserve memory for context buffers - if (!params.vocab_only) { - if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) { + if (!hparams.vocab_only) { + if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; @@ -5970,11 +8155,9 @@ struct llama_context * llama_new_context_with_model( LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } - const auto & hparams = ctx->model.hparams; - // resized during inference if (params.logits_all) { - ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab); + ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab); } else { ctx->logits.reserve(hparams.n_vocab); } @@ -5986,32 +8169,35 @@ struct llama_context * llama_new_context_with_model( { static const size_t tensor_alignment = 32; // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data - ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead()); + ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead()); // create measure allocator ctx->alloc = ggml_allocr_new_measure(tensor_alignment); // build worst-case graph - int n_tokens = std::min((int)hparams.n_ctx, params.n_batch); - int n_past = hparams.n_ctx - n_tokens; - llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past); + int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch); + int n_past = cparams.n_ctx - n_tokens; + llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0)); + #ifdef GGML_USE_METAL - if (params.n_gpu_layers > 0) { + if (model->n_gpu_layers > 0) { + ggml_metal_log_set_callback(llama_log_callback_default, NULL); + ctx->ctx_metal = ggml_metal_init(1); if (!ctx->ctx_metal) { LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__); llama_free(ctx); return NULL; } - ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false); - ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); + //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false); + //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); } #endif // measure memory requirements for the graph size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment; - LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); // recreate allocator with exact memory requirements ggml_allocr_free(ctx->alloc); @@ -6020,28 +8206,46 @@ struct llama_context * llama_new_context_with_model( ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment); #ifdef GGML_USE_METAL if (ctx->ctx_metal) { - ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); + //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); } #endif #ifdef GGML_USE_CUBLAS - if (params.low_vram) { - LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__); - ggml_cuda_set_scratch_size(0); // disable scratch - } else { - ggml_cuda_set_scratch_size(alloc_size); - LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0); + ggml_cuda_set_scratch_size(alloc_size); + LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0); + + // calculate total VRAM usage + auto add_tensor = [](const ggml_tensor * t, size_t & size) { + if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) { + size += ggml_nbytes(t); + } + }; + size_t model_vram_size = 0; + for (const auto & kv : model->tensors_by_name) { + add_tensor(kv.second, model_vram_size); } + + size_t kv_vram_size = 0; + add_tensor(ctx->kv_self.k, kv_vram_size); + add_tensor(ctx->kv_self.v, kv_vram_size); + + size_t ctx_vram_size = alloc_size + kv_vram_size; + size_t total_vram_size = model_vram_size + ctx_vram_size; + + LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__, + total_vram_size / 1024.0 / 1024.0, + model_vram_size / 1024.0 / 1024.0, + ctx_vram_size / 1024.0 / 1024.0); #endif } #ifdef GGML_USE_METAL - if (params.n_gpu_layers > 0) { + if (model->n_gpu_layers > 0) { // this allocates all Metal resources and memory buffers void * data_ptr = NULL; size_t data_size = 0; - if (params.use_mmap) { + if (ctx->model.mapping) { data_ptr = ctx->model.mapping->addr; data_size = ctx->model.mapping->size; } else { @@ -6060,11 +8264,8 @@ struct llama_context * llama_new_context_with_model( return NULL; \ } - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); - - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0)); - + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0)); LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0)); #undef LLAMA_METAL_CHECK_BUF } @@ -6076,8 +8277,10 @@ struct llama_context * llama_new_context_with_model( if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { // Enter a blocking eval loop with dummy input, letting rank=0 drive the process - const std::vector tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx)); - while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {}; + // TODO: needs fix after #3228 + GGML_ASSERT(false && "not implemented"); + //const std::vector tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx)); + //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {}; llama_backend_free(); exit(1); } @@ -6086,63 +8289,41 @@ struct llama_context * llama_new_context_with_model( return ctx; } -struct llama_context * llama_init_from_file( - const char * path_model, - struct llama_context_params params) { - struct llama_model * model = llama_load_model_from_file(path_model, params); - if (!model) { - return nullptr; - } - - struct llama_context * ctx = llama_new_context_with_model(model, params); - ctx->model_owner = true; - - return ctx; -} - void llama_free(struct llama_context * ctx) { delete ctx; } -int llama_n_vocab(const struct llama_context * ctx) { - return llama_model_n_vocab(&ctx->model); +const llama_model * llama_get_model(const struct llama_context * ctx) { + return &ctx->model; } int llama_n_ctx(const struct llama_context * ctx) { - return llama_model_n_ctx(&ctx->model); + return ctx->cparams.n_ctx; } -int llama_n_ctx_train(const struct llama_context * ctx) { - return llama_model_n_ctx_train(&ctx->model); +enum llama_vocab_type llama_vocab_type(const struct llama_model * model) { + return model->vocab.type; } -int llama_n_embd(const struct llama_context * ctx) { - return llama_model_n_embd(&ctx->model); -} - -enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) { - return ctx->model.vocab.type; -} - -int llama_model_n_vocab(const struct llama_model * model) { +int llama_n_vocab(const struct llama_model * model) { return model->vocab.id_to_token.size(); } -int llama_model_n_ctx(const struct llama_model * model) { - return model->hparams.n_ctx; -} - -int llama_model_n_ctx_train(const struct llama_model * model) { +int llama_n_ctx_train(const struct llama_model * model) { return model->hparams.n_ctx_train; } -int llama_model_n_embd(const struct llama_model * model) { +int llama_n_embd(const struct llama_model * model) { return model->hparams.n_embd; } +float llama_rope_freq_scale_train(const struct llama_model * model) { + return model->hparams.rope_freq_scale_train; +} + int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) { return snprintf(buf, buf_size, "%s %s %s", - model->name.c_str(), + llama_model_arch_name(model->arch).c_str(), llama_model_type_name(model->type), llama_model_ftype_name(model->ftype).c_str()); } @@ -6163,6 +8344,10 @@ uint64_t llama_model_n_params(const struct llama_model * model) { return nparams; } +struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) { + return ggml_get_tensor(model->ctx, name); +} + int llama_model_quantize( const char * fname_inp, const char * fname_out, @@ -6176,18 +8361,18 @@ int llama_model_quantize( } } -int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { +int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) { try { - return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads); + return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); return 1; } } -int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) { +int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) { try { - return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads); + return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); return 1; @@ -6195,16 +8380,30 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha } int llama_get_kv_cache_token_count(const struct llama_context * ctx) { - return ctx->kv_self.n; + return ctx->kv_self.head; } -#define LLAMA_MAX_RNG_STATE (64*1024) +void llama_kv_cache_clear(struct llama_context * ctx) { + llama_kv_cache_clear(ctx->kv_self); +} -void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) { - if (seed == LLAMA_DEFAULT_SEED) { - seed = time(NULL); +void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1); +} + +void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + if (seq_id_src == seq_id_dst) { + return; } - ctx->rng.seed(seed); + llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); +} + +void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) { + llama_kv_cache_seq_keep(ctx->kv_self, seq_id); +} + +void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { + llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta); } // Returns the *maximum* size of the state @@ -6291,7 +8490,7 @@ struct llama_data_file_context : llama_data_context { * llama_copy_state_data(ctx, &data_ctx); * */ -void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { +static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { // copy rng { std::stringstream rng_ss; @@ -6342,41 +8541,45 @@ void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_conte { const auto & kv_self = ctx->kv_self; const auto & hparams = ctx->model.hparams; - const int n_layer = hparams.n_layer; - const int n_embd = hparams.n_embd_gqa(); - const int n_ctx = hparams.n_ctx; + const auto & cparams = ctx->cparams; - const size_t kv_size = kv_self.buf.size; - const int kv_ntok = llama_get_kv_cache_token_count(ctx); + const auto n_layer = hparams.n_layer; + const auto n_embd = hparams.n_embd_gqa(); + const auto n_ctx = cparams.n_ctx; - data_ctx->write(&kv_size, sizeof(kv_size)); - data_ctx->write(&kv_ntok, sizeof(kv_ntok)); + const size_t kv_buf_size = kv_self.buf.size; + const uint32_t kv_head = kv_self.head; + const uint32_t kv_size = kv_self.size; - if (kv_size) { + data_ctx->write(&kv_buf_size, sizeof(kv_buf_size)); + data_ctx->write(&kv_head, sizeof(kv_head)); + data_ctx->write(&kv_size, sizeof(kv_size)); + + if (kv_buf_size) { const size_t elt_size = ggml_element_size(kv_self.k); - ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); - ggml_cgraph gf{}; + ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); + ggml_cgraph * gf = ggml_new_graph(cpy_ctx); - ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); + ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer); std::vector kout3d_data(ggml_nbytes(kout3d), 0); kout3d->data = kout3d_data.data(); - ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); + ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer); std::vector vout3d_data(ggml_nbytes(vout3d), 0); vout3d->data = vout3d_data.data(); ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, - n_embd, kv_ntok, n_layer, + n_embd, kv_head, n_layer, elt_size*n_embd, elt_size*n_embd*n_ctx, 0); ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v, - kv_ntok, n_embd, n_layer, + kv_head, n_embd, n_layer, elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d)); - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); - ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d)); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d)); + ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); ggml_free(cpy_ctx); @@ -6385,6 +8588,20 @@ void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_conte data_ctx->write(kout3d_data.data(), kout3d_data.size()); data_ctx->write(vout3d_data.data(), vout3d_data.size()); } + + for (uint32_t i = 0; i < kv_size; ++i) { + const auto & cell = kv_self.cells[i]; + + const llama_pos pos = cell.pos; + const size_t seq_id_size = cell.seq_id.size(); + + data_ctx->write(&pos, sizeof(pos)); + data_ctx->write(&seq_id_size, sizeof(seq_id_size)); + + for (auto seq_id : cell.seq_id) { + data_ctx->write(&seq_id, sizeof(seq_id)); + } + } } } @@ -6450,48 +8667,72 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { { const auto & kv_self = ctx->kv_self; const auto & hparams = ctx->model.hparams; + const auto & cparams = ctx->cparams; + const int n_layer = hparams.n_layer; const int n_embd = hparams.n_embd_gqa(); - const int n_ctx = hparams.n_ctx; + const int n_ctx = cparams.n_ctx; - size_t kv_size; - int kv_ntok; + size_t kv_buf_size; + uint32_t kv_head; + uint32_t kv_size; - memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size); - memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok); + memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size); + memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head); + memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size); - if (kv_size) { - GGML_ASSERT(kv_self.buf.size == kv_size); + if (kv_buf_size) { + GGML_ASSERT(kv_self.buf.size == kv_buf_size); const size_t elt_size = ggml_element_size(kv_self.k); - ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); - ggml_cgraph gf{}; + ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); + ggml_cgraph * gf = ggml_new_graph(cpy_ctx); - ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); + ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer); kin3d->data = (void *) inp; inp += ggml_nbytes(kin3d); - ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); + ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer); vin3d->data = (void *) inp; inp += ggml_nbytes(vin3d); ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, - n_embd, kv_ntok, n_layer, + n_embd, kv_head, n_layer, elt_size*n_embd, elt_size*n_embd*n_ctx, 0); ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v, - kv_ntok, n_embd, n_layer, + kv_head, n_embd, n_layer, elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d)); - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); - ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d)); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d)); + ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); ggml_free(cpy_ctx); } - ctx->kv_self.n = kv_ntok; + ctx->kv_self.head = kv_head; + ctx->kv_self.size = kv_size; + + ctx->kv_self.cells.resize(kv_size); + + for (uint32_t i = 0; i < kv_size; ++i) { + llama_pos pos; + size_t seq_id_size; + + memcpy(&pos, inp, sizeof(pos)); inp += sizeof(pos); + memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size); + + ctx->kv_self.cells[i].pos = pos; + + llama_seq_id seq_id; + + for (size_t j = 0; j < seq_id_size; ++j) { + memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id); + ctx->kv_self.cells[i].seq_id.insert(seq_id); + } + } } const size_t nread = inp - src; @@ -6586,108 +8827,166 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi int llama_eval( struct llama_context * ctx, - const llama_token * tokens, - int n_tokens, - int n_past, - int n_threads) { - if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) { - LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); - return 1; - } + llama_token * tokens, + int32_t n_tokens, + int n_past) { + llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1); - // get a more accurate load time, upon first eval - // TODO: fix this - if (!ctx->has_evaluated_once) { - ctx->t_load_us = ggml_time_us() - ctx->t_start_us; - ctx->has_evaluated_once = true; + const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0)); + if (ret < 0) { + LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); } - return 0; + return ret; } int llama_eval_embd( struct llama_context * ctx, - const float * embd, - int n_tokens, - int n_past, - int n_threads) { - if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) { - LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); - return 1; - } + float * embd, + int32_t n_tokens, + int n_past) { + llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1); - // get a more accurate load time, upon first eval - // TODO: fix this - if (!ctx->has_evaluated_once) { - ctx->t_load_us = ggml_time_us() - ctx->t_start_us; - ctx->has_evaluated_once = true; + llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, }; + + const int ret = llama_decode_internal(*ctx, batch); + if (ret < 0) { + LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); } - return 0; + return ret; +} + +void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) { + ctx->cparams.n_threads = n_threads; + ctx->cparams.n_threads_batch = n_threads_batch; +} + +struct llama_batch llama_batch_get_one( + llama_token * tokens, + int32_t n_tokens, + llama_pos pos_0, + llama_seq_id seq_id) { + return { + /*n_tokens =*/ n_tokens, + /*tokens =*/ tokens, + /*embd =*/ nullptr, + /*pos =*/ nullptr, + /*n_seq_id =*/ nullptr, + /*seq_id =*/ nullptr, + /*logits =*/ nullptr, + /*all_pos_0 =*/ pos_0, + /*all_pos_1 =*/ 1, + /*all_seq_id =*/ seq_id, + }; } -int llama_eval_export(struct llama_context * ctx, const char * fname) { - const int n_batch = 1; - const int n_ctx = 512 - n_batch; +struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) { + llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, }; - const std::vector tmp(n_batch, llama_token_bos(ctx)); + if (embd) { + batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd); + } else { + batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens); + } - if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) { - LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); - return 1; + batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens); + batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens); + batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens); + for (int i = 0; i < n_tokens; ++i) { + batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max); } + batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens); - return 0; + return batch; +} + +void llama_batch_free(struct llama_batch batch) { + if (batch.token) free(batch.token); + if (batch.embd) free(batch.embd); + if (batch.pos) free(batch.pos); + if (batch.n_seq_id) free(batch.n_seq_id); + if (batch.seq_id) { + for (int i = 0; i < batch.n_tokens; ++i) { + free(batch.seq_id[i]); + } + free(batch.seq_id); + } + if (batch.logits) free(batch.logits); +} + +int llama_decode( + struct llama_context * ctx, + struct llama_batch batch) { + const int ret = llama_decode_internal(*ctx, batch); + if (ret < 0) { + LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); + } + + return ret; } float * llama_get_logits(struct llama_context * ctx) { return ctx->logits.data(); } +float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { + return ctx->logits.data() + i*ctx->model.hparams.n_vocab; +} + float * llama_get_embeddings(struct llama_context * ctx) { return ctx->embedding.data(); } -const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) { - return ctx->model.vocab.id_to_token[token].text.c_str(); +const char * llama_token_get_text(const struct llama_model * model, llama_token token) { + return model->vocab.id_to_token[token].text.c_str(); } -float llama_token_get_score(const struct llama_context * ctx, llama_token token) { - return ctx->model.vocab.id_to_token[token].score; +float llama_token_get_score(const struct llama_model * model, llama_token token) { + return model->vocab.id_to_token[token].score; } -llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) { - return ctx->model.vocab.id_to_token[token].type; +llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) { + return model->vocab.id_to_token[token].type; } -llama_token llama_token_bos(const struct llama_context * ctx) { - return ctx->model.vocab.special_bos_id; +llama_token llama_token_bos(const struct llama_model * model) { + return model->vocab.special_bos_id; } -llama_token llama_token_eos(const struct llama_context * ctx) { - return ctx->model.vocab.special_eos_id; +llama_token llama_token_eos(const struct llama_model * model) { + return model->vocab.special_eos_id; } -llama_token llama_token_nl(const struct llama_context * ctx) { - return ctx->model.vocab.linefeed_id; +llama_token llama_token_nl(const struct llama_model * model) { + return model->vocab.linefeed_id; } -int llama_tokenize( - struct llama_context * ctx, - const char * text, - llama_token * tokens, - int n_max_tokens, - bool add_bos) { - return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos); +llama_token llama_token_prefix(const struct llama_model * model) { + return model->vocab.special_prefix_id; +} + +llama_token llama_token_middle(const struct llama_model * model) { + return model->vocab.special_middle_id; } -int llama_tokenize_with_model( +llama_token llama_token_suffix(const struct llama_model * model) { + return model->vocab.special_suffix_id; +} + +llama_token llama_token_eot(const struct llama_model * model) { + return model->vocab.special_eot_id; +} + +int llama_tokenize( const struct llama_model * model, const char * text, + int text_len, llama_token * tokens, int n_max_tokens, - bool add_bos) { - auto res = llama_tokenize_internal(model->vocab, text, add_bos); + bool add_bos, + bool special) { + auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special); if (n_max_tokens < (int) res.size()) { // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); @@ -6701,39 +9000,70 @@ int llama_tokenize_with_model( return res.size(); } -int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) { - return llama_token_to_piece_with_model(&ctx->model, token, buf, length); +static std::string llama_decode_text(const std::string & text) { + std::string decoded_text; + auto unicode_sequences = codepoints_from_utf8(text); + for (auto& unicode_sequence : unicode_sequences) { + decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence)); + } + + return decoded_text; } // does not write null-terminator to buf -int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) { - if (0 <= token && token < llama_model_n_vocab(model)) { - if (llama_is_normal_token(model->vocab, token)) { - std::string result = model->vocab.id_to_token[token].text; - if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) { +int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) { + if (0 <= token && token < llama_n_vocab(model)) { + switch (llama_vocab_get_type(model->vocab)) { + case LLAMA_VOCAB_TYPE_SPM: { + if (llama_is_normal_token(model->vocab, token)) { + std::string result = model->vocab.id_to_token[token].text; llama_unescape_whitespace(result); + if (length < (int) result.length()) { + return -result.length(); + } + memcpy(buf, result.c_str(), result.length()); + return result.length(); + } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT + if (length < 3) { + return -3; + } + memcpy(buf, "\xe2\x96\x85", 3); + return 3; + } else if (llama_is_control_token(model->vocab, token)) { + ; + } else if (llama_is_byte_token(model->vocab, token)) { + if (length < 1) { + return -1; + } + buf[0] = llama_token_to_byte(model->vocab, token); + return 1; + } else { + // TODO: for now we accept all unsupported token types, + // suppressing them like CONTROL tokens. + // GGML_ASSERT(false); } - if (length < (int) result.length()) { - return -result.length(); - } - memcpy(buf, result.c_str(), result.length()); - return result.length(); - } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT - if (length < 3) { - return -3; - } - buf[0] = '\xe2'; - buf[1] = '\x96'; - buf[2] = '\x85'; - return 3; - } else if (llama_is_control_token(model->vocab, token)) { - ; - } else if (llama_is_byte_token(model->vocab, token)) { - if (length < 1) { - return -1; + break; + } + case LLAMA_VOCAB_TYPE_BPE: { + if (llama_is_normal_token(model->vocab, token)) { + std::string result = model->vocab.id_to_token[token].text; + result = llama_decode_text(result); + if (length < (int) result.length()) { + return -result.length(); + } + memcpy(buf, result.c_str(), result.length()); + return result.length(); + } else if (llama_is_control_token(model->vocab, token)) { + ; + } else { + // TODO: for now we accept all unsupported token types, + // suppressing them like CONTROL tokens. + // GGML_ASSERT(false); } - buf[0] = llama_token_to_byte(model->vocab, token); - return 1; + break; + } + default: + GGML_ASSERT(false); } } return 0; @@ -6760,14 +9090,14 @@ void llama_print_timings(struct llama_context * ctx) { const llama_timings timings = llama_get_timings(ctx); LLAMA_LOG_INFO("\n"); - LLAMA_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms); - LLAMA_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms); + LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample); - LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval); - LLAMA_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval); - LLAMA_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms)); + LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms)); } void llama_reset_timings(struct llama_context * ctx) { @@ -6829,16 +9159,18 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) { } // For internal test use -const std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx) { +const std::vector> & llama_internal_get_tensor_map( + struct llama_context * ctx +) { return ctx->model.tensors_by_name; } -void llama_log_set(llama_log_callback log_callback, void * user_data) { +void llama_log_set(ggml_log_callback log_callback, void * user_data) { g_state.log_callback = log_callback ? log_callback : llama_log_callback_default; g_state.log_callback_user_data = user_data; } -static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) { +static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) { va_list args_copy; va_copy(args_copy, args); char buffer[128]; @@ -6855,14 +9187,14 @@ static void llama_log_internal_v(llama_log_level level, const char * format, va_ va_end(args_copy); } -static void llama_log_internal(llama_log_level level, const char * format, ...) { +static void llama_log_internal(ggml_log_level level, const char * format, ...) { va_list args; va_start(args, format); llama_log_internal_v(level, format, args); va_end(args); } -static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) { +static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) { (void) level; (void) user_data; fputs(text, stderr);