ggerganov committed on
Commit aa42df9 · unverified · 1 Parent(s): be7d266

talk-llama : sync llama.cpp

examples/talk-llama/llama.cpp CHANGED
@@ -196,6 +196,7 @@ enum llm_arch {
196
  LLM_ARCH_STARCODER,
197
  LLM_ARCH_PERSIMMON,
198
  LLM_ARCH_REFACT,
 
199
  LLM_ARCH_BLOOM,
200
  LLM_ARCH_STABLELM,
201
  LLM_ARCH_QWEN,
@@ -220,6 +221,7 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
220
  { LLM_ARCH_STARCODER, "starcoder" },
221
  { LLM_ARCH_PERSIMMON, "persimmon" },
222
  { LLM_ARCH_REFACT, "refact" },
 
223
  { LLM_ARCH_BLOOM, "bloom" },
224
  { LLM_ARCH_STABLELM, "stablelm" },
225
  { LLM_ARCH_QWEN, "qwen" },
@@ -261,6 +263,7 @@ enum llm_kv {
261
  LLM_KV_ATTENTION_VALUE_LENGTH,
262
  LLM_KV_ATTENTION_LAYERNORM_EPS,
263
  LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
 
264
 
265
  LLM_KV_ROPE_DIMENSION_COUNT,
266
  LLM_KV_ROPE_FREQ_BASE,
@@ -273,6 +276,7 @@ enum llm_kv {
273
  LLM_KV_TOKENIZER_MODEL,
274
  LLM_KV_TOKENIZER_LIST,
275
  LLM_KV_TOKENIZER_TOKEN_TYPE,
 
276
  LLM_KV_TOKENIZER_SCORES,
277
  LLM_KV_TOKENIZER_MERGES,
278
  LLM_KV_TOKENIZER_BOS_ID,
@@ -316,6 +320,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
316
  { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
317
  { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
318
  { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
 
319
 
320
  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
321
  { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -328,6 +333,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
328
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
329
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
330
  { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
 
331
  { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
332
  { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
333
  { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
@@ -355,6 +361,7 @@ struct LLM_KV {
355
  enum llm_tensor {
356
  LLM_TENSOR_TOKEN_EMBD,
357
  LLM_TENSOR_TOKEN_EMBD_NORM,
 
358
  LLM_TENSOR_POS_EMBD,
359
  LLM_TENSOR_OUTPUT,
360
  LLM_TENSOR_OUTPUT_NORM,
@@ -536,6 +543,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
536
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
537
  },
538
  },
539
  {
540
  LLM_ARCH_BLOOM,
541
  {
@@ -1440,6 +1464,11 @@ static llama_state g_state;
1440
  // available llama models
1441
  enum e_model {
1442
  MODEL_UNKNOWN,
1443
  MODEL_0_5B,
1444
  MODEL_1B,
1445
  MODEL_2B,
@@ -1481,6 +1510,7 @@ struct llama_hparams {
1481
  uint32_t n_ff;
1482
  uint32_t n_expert = 0;
1483
  uint32_t n_expert_used = 0;
 
1484
 
1485
  float f_norm_eps;
1486
  float f_norm_rms_eps;
@@ -1493,6 +1523,8 @@ struct llama_hparams {
1493
  float f_clamp_kqv;
1494
  float f_max_alibi_bias;
1495
 
 
 
1496
 
1497
  bool operator!=(const llama_hparams & other) const {
1498
  if (this->vocab_only != other.vocab_only) return true;
@@ -1720,6 +1752,7 @@ struct llama_model {
1720
  llama_vocab vocab;
1721
 
1722
  struct ggml_tensor * tok_embd;
 
1723
  struct ggml_tensor * pos_embd;
1724
  struct ggml_tensor * tok_norm;
1725
  struct ggml_tensor * tok_norm_b;
@@ -1839,8 +1872,6 @@ struct llama_context {
1839
  // memory buffers used to evaluate the model
1840
  std::vector<uint8_t> buf_compute_meta;
1841
  ggml_backend_sched_t sched = nullptr;
1842
- // allocator for the input tensors
1843
- ggml_tallocr * alloc = nullptr;
1844
 
1845
  // input tensors
1846
  ggml_backend_buffer_t buf_input = nullptr;
@@ -1850,6 +1881,7 @@ struct llama_context {
1850
  struct ggml_tensor * inp_pos; // I32 [n_batch]
1851
  struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
1852
  struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
 
1853
 
1854
  #ifdef GGML_USE_MPI
1855
  ggml_mpi_context * ctx_mpi = NULL;
@@ -2829,6 +2861,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
2829
  switch (type) {
2830
  case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2831
  case LLAMA_VOCAB_TYPE_BPE: return "BPE";
 
2832
  default: return "unknown";
2833
  }
2834
  }
@@ -3000,6 +3033,26 @@ static void llm_load_hparams(
3000
  default: model.type = e_model::MODEL_UNKNOWN;
3001
  }
3002
  } break;
3003
  case LLM_ARCH_BLOOM:
3004
  {
3005
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3204,6 +3257,16 @@ static void llm_load_vocab(
3204
  vocab.special_unk_id = -1;
3205
  vocab.special_sep_id = -1;
3206
  vocab.special_pad_id = -1;
3207
  } else {
3208
  LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
3209
  LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
@@ -3232,6 +3295,8 @@ static void llm_load_vocab(
3232
  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
3233
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
3234
  vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
 
 
3235
  } else {
3236
  const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
3237
  GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
@@ -3569,6 +3634,7 @@ static bool llm_load_tensors(
3569
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
3570
  const int64_t n_embd_gqa = n_embd_v_gqa;
3571
  const int64_t n_vocab = hparams.n_vocab;
 
3572
  const int64_t n_ff = hparams.n_ff;
3573
 
3574
  GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
@@ -3783,11 +3849,50 @@ static bool llm_load_tensors(
3783
  layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
3784
  }
3785
  } break;
3786
- case LLM_ARCH_BLOOM:
3787
  {
3788
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3789
- model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
3790
- model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
3791
 
3792
  // output
3793
  {
@@ -4739,6 +4844,7 @@ struct llm_build_context {
4739
  const int32_t n_orig_ctx;
4740
 
4741
  const bool do_rope_shift;
 
4742
 
4743
  const llm_build_cb & cb;
4744
 
@@ -4782,6 +4888,7 @@ struct llm_build_context {
4782
  kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
4783
  n_orig_ctx (cparams.n_yarn_orig_ctx),
4784
  do_rope_shift (worst_case || kv_self.has_shift),
 
4785
  cb (cb),
4786
  buf_compute_meta (lctx.buf_compute_meta) {
4787
  // all initializations should be done in init()
@@ -5625,6 +5732,100 @@ struct llm_build_context {
5625
  return gf;
5626
  }
5627
 
5628
  struct ggml_cgraph * build_bloom() {
5629
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5630
 
@@ -6996,12 +7197,10 @@ struct llm_build_context {
6996
 
6997
  static struct ggml_cgraph * llama_build_graph(
6998
  llama_context & lctx,
6999
- const llama_batch & batch) {
 
7000
  const auto & model = lctx.model;
7001
 
7002
- // check if we should build the worst-case graph (for memory measurement)
7003
- const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
7004
-
7005
  // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
7006
  llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
7007
  if (il >= 0) {
@@ -7022,67 +7221,6 @@ static struct ggml_cgraph * llama_build_graph(
7022
 
7023
  struct llm_build_context llm(lctx, batch, cb, worst_case);
7024
 
7025
- //
7026
- // set input data
7027
- //
7028
-
7029
- if (!ggml_tallocr_is_measure(lctx.alloc)) {
7030
- if (batch.token) {
7031
- const int64_t n_tokens = batch.n_tokens;
7032
-
7033
- ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
7034
- }
7035
-
7036
- if (batch.embd) {
7037
- const int64_t n_embd = llm.n_embd;
7038
- const int64_t n_tokens = batch.n_tokens;
7039
-
7040
- ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
7041
- }
7042
-
7043
- if (batch.pos) {
7044
- const int64_t n_tokens = batch.n_tokens;
7045
-
7046
- ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
7047
- }
7048
-
7049
- {
7050
- const int64_t n_kv = llm.n_kv;
7051
- const int64_t n_tokens = batch.n_tokens;
7052
-
7053
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
7054
- float * data = (float *) lctx.inp_KQ_mask->data;
7055
-
7056
- for (int h = 0; h < 1; ++h) {
7057
- for (int j = 0; j < n_tokens; ++j) {
7058
- const llama_pos pos = batch.pos[j];
7059
- const llama_seq_id seq_id = batch.seq_id[j][0];
7060
-
7061
- for (int i = 0; i < n_kv; ++i) {
7062
- float f;
7063
- if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
7064
- f = -INFINITY;
7065
- } else {
7066
- f = 0;
7067
- }
7068
- data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
7069
- }
7070
- }
7071
- }
7072
- }
7073
-
7074
- if (llm.do_rope_shift) {
7075
- const int64_t n_ctx = llm.n_ctx;
7076
-
7077
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
7078
- int32_t * data = (int32_t *) lctx.inp_K_shift->data;
7079
-
7080
- for (int i = 0; i < n_ctx; ++i) {
7081
- data[i] = lctx.kv_self.cells[i].delta;
7082
- }
7083
- }
7084
- }
7085
-
7086
  llm.init();
7087
 
7088
  switch (model.arch) {
@@ -7110,6 +7248,10 @@ static struct ggml_cgraph * llama_build_graph(
7110
  {
7111
  result = llm.build_refact();
7112
  } break;
7113
  case LLM_ARCH_BLOOM:
7114
  {
7115
  result = llm.build_bloom();
@@ -7167,6 +7309,83 @@ static struct ggml_cgraph * llama_build_graph(
7167
  return result;
7168
  }
7169
 
7170
  // decode a batch of tokens by evaluating the transformer
7171
  //
7172
  // - lctx: llama context
@@ -7265,17 +7484,22 @@ static int llama_decode_internal(
7265
  ggml_backend_sched_reset(lctx.sched);
7266
  ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
7267
 
7268
- ggml_cgraph * gf = llama_build_graph(lctx, batch);
7269
 
7270
  // the output is always the last tensor in the graph
7271
  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
7272
- GGML_ASSERT(strcmp(res->name, "result_output") == 0);
7273
-
7274
- // the embeddings could be the second to last tensor, or the third to last tensor
7275
  struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
7276
- if (strcmp(embeddings->name, "result_norm") != 0) {
7277
- embeddings = gf->nodes[gf->n_nodes - 3];
7278
- GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
7279
  }
7280
 
7281
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -7305,6 +7529,9 @@ static int llama_decode_internal(
7305
  if (lctx.backend_cpu != nullptr) {
7306
  ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
7307
  }
 
 
 
7308
  ggml_backend_sched_graph_compute(lctx.sched, gf);
7309
 
7310
  // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
@@ -7344,7 +7571,7 @@ static int llama_decode_internal(
7344
  // extract logits
7345
  // TODO: do not compute and extract logits if only embeddings are needed
7346
  // need to update the graphs to skip "result_output"
7347
- {
7348
  auto & logits_out = lctx.logits;
7349
 
7350
  #ifndef NDEBUG
@@ -7388,9 +7615,11 @@ static int llama_decode_internal(
7388
  if (!lctx.embedding.empty()) {
7389
  auto & embedding_out = lctx.embedding;
7390
 
 
 
7391
  embedding_out.resize(n_embd);
7392
  ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
7393
- ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
7394
  ggml_backend_synchronize(embeddings_backend);
7395
  }
7396
 
@@ -7454,6 +7683,9 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
7454
  GGML_ASSERT(false);
7455
  return unicode_to_bytes_bpe(token_data.text);
7456
  }
 
 
 
7457
  default:
7458
  GGML_ASSERT(false);
7459
  }
@@ -7466,6 +7698,7 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
7466
  const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
7467
  return vocab.token_to_id.at(buf);
7468
  }
 
7469
  case LLAMA_VOCAB_TYPE_BPE: {
7470
  return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
7471
  }
@@ -7936,12 +8169,212 @@ private:
7936
  llm_bigram_bpe::queue work_queue;
7937
  };
7938
 
7939
- typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
7940
  FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
7941
  FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
7942
  } FRAGMENT_BUFFER_VARIANT_TYPE;
7943
 
7944
- struct fragment_buffer_variant{
7945
  fragment_buffer_variant(llama_vocab::id _token)
7946
  :
7947
  type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
@@ -7971,8 +8404,7 @@ struct fragment_buffer_variant{
7971
 
7972
  // #define PRETOKENIZERDEBUG
7973
 
7974
- static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
7975
- {
7976
  // for each special token
7977
  for (const auto & st: vocab.special_tokens_cache) {
7978
  const auto & special_token = st.first;
@@ -8090,10 +8522,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
8090
  switch (vocab.type) {
8091
  case LLAMA_VOCAB_TYPE_SPM:
8092
  {
8093
- for (const auto & fragment: fragment_buffer)
8094
- {
8095
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
8096
- {
8097
  // without adding this leading whitespace, we do not get the same results as the original tokenizer
8098
 
8099
  // TODO: It's likely possible to get rid of this string copy entirely
@@ -8113,19 +8543,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
8113
  llm_tokenizer_spm tokenizer(vocab);
8114
  llama_escape_whitespace(raw_text);
8115
  tokenizer.tokenize(raw_text, output);
8116
- }
8117
- else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
8118
- {
8119
  output.push_back(fragment.token);
8120
  }
8121
  }
8122
  } break;
8123
  case LLAMA_VOCAB_TYPE_BPE:
8124
  {
8125
- for (const auto & fragment: fragment_buffer)
8126
- {
8127
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
8128
- {
8129
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
8130
 
8131
  #ifdef PRETOKENIZERDEBUG
@@ -8133,9 +8559,23 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
8133
  #endif
8134
  llm_tokenizer_bpe tokenizer(vocab);
8135
  tokenizer.tokenize(raw_text, output);
 
 
8136
  }
8137
- else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
8138
- {
8139
  output.push_back(fragment.token);
8140
  }
8141
  }
@@ -10799,7 +11239,7 @@ struct llama_context * llama_new_context_with_model(
10799
  // graph inputs
10800
  {
10801
  ggml_init_params init_params = {
10802
- /* .mem_size */ ggml_tensor_overhead()*5,
10803
  /* .mem_buffer */ nullptr,
10804
  /* .no_alloc */ true,
10805
  };
@@ -10810,12 +11250,14 @@ struct llama_context * llama_new_context_with_model(
10810
  ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
10811
  ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
10812
  ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
 
10813
 
10814
  ggml_set_name(ctx->inp_tokens, "inp_tokens");
10815
  ggml_set_name(ctx->inp_embd, "inp_embd");
10816
  ggml_set_name(ctx->inp_pos, "inp_pos");
10817
  ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
10818
  ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
 
10819
 
10820
  ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
10821
 
@@ -10841,23 +11283,27 @@ struct llama_context * llama_new_context_with_model(
10841
  ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
10842
 
10843
  ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
10844
- ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
10845
 
10846
  // build worst-case graph
10847
  int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
10848
  int n_past = cparams.n_ctx - n_tokens;
10849
  llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
10850
- ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
10851
 
10852
  // initialize scheduler with the worst-case graph
10853
- ggml_backend_sched_init_measure(ctx->sched, gf);
10854
- ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
 
 
 
10855
 
10856
- for (ggml_backend_t backend : ctx->backends) {
10857
- ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
 
 
10858
  LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
10859
- ggml_backend_buffer_name(buf),
10860
- ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
10861
  }
10862
 
10863
  // note: the number of splits during measure is higher than during inference due to the kv shift
@@ -11746,6 +12192,7 @@ static std::string llama_decode_text(const std::string & text) {
11746
  int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
11747
  if (0 <= token && token < llama_n_vocab(model)) {
11748
  switch (llama_vocab_get_type(model->vocab)) {
 
11749
  case LLAMA_VOCAB_TYPE_SPM: {
11750
  // NOTE: we accept all unsupported token types,
11751
  // suppressing them like CONTROL tokens.
@@ -11869,6 +12316,7 @@ const char * llama_print_system_info(void) {
11869
  s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
11870
  s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
11871
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
 
11872
 
11873
  return s.c_str();
11874
  }
 
196
  LLM_ARCH_STARCODER,
197
  LLM_ARCH_PERSIMMON,
198
  LLM_ARCH_REFACT,
199
+ LLM_ARCH_BERT,
200
  LLM_ARCH_BLOOM,
201
  LLM_ARCH_STABLELM,
202
  LLM_ARCH_QWEN,
 
221
  { LLM_ARCH_STARCODER, "starcoder" },
222
  { LLM_ARCH_PERSIMMON, "persimmon" },
223
  { LLM_ARCH_REFACT, "refact" },
224
+ { LLM_ARCH_BERT, "bert" },
225
  { LLM_ARCH_BLOOM, "bloom" },
226
  { LLM_ARCH_STABLELM, "stablelm" },
227
  { LLM_ARCH_QWEN, "qwen" },
 
263
  LLM_KV_ATTENTION_VALUE_LENGTH,
264
  LLM_KV_ATTENTION_LAYERNORM_EPS,
265
  LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
266
+ LLM_KV_ATTENTION_CAUSAL,
267
 
268
  LLM_KV_ROPE_DIMENSION_COUNT,
269
  LLM_KV_ROPE_FREQ_BASE,
 
276
  LLM_KV_TOKENIZER_MODEL,
277
  LLM_KV_TOKENIZER_LIST,
278
  LLM_KV_TOKENIZER_TOKEN_TYPE,
279
+ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
280
  LLM_KV_TOKENIZER_SCORES,
281
  LLM_KV_TOKENIZER_MERGES,
282
  LLM_KV_TOKENIZER_BOS_ID,
 
320
  { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
321
  { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
322
  { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
323
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
324
 
325
  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
326
  { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
 
333
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
334
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
335
  { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
336
+ { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
337
  { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
338
  { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
339
  { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
 
361
  enum llm_tensor {
362
  LLM_TENSOR_TOKEN_EMBD,
363
  LLM_TENSOR_TOKEN_EMBD_NORM,
364
+ LLM_TENSOR_TOKEN_TYPES,
365
  LLM_TENSOR_POS_EMBD,
366
  LLM_TENSOR_OUTPUT,
367
  LLM_TENSOR_OUTPUT_NORM,
 
543
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
544
  },
545
  },
546
+ {
547
+ LLM_ARCH_BERT,
548
+ {
549
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
550
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
551
+ { LLM_TENSOR_TOKEN_TYPES, "token_types" },
552
+ { LLM_TENSOR_POS_EMBD, "position_embd" },
553
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_output_norm" },
554
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
555
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
556
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
557
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
558
+ { LLM_TENSOR_FFN_NORM, "blk.%d.layer_output_norm" },
559
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
560
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
561
+ },
562
+ },
563
  {
564
  LLM_ARCH_BLOOM,
565
  {
 
1464
  // available llama models
1465
  enum e_model {
1466
  MODEL_UNKNOWN,
1467
+ MODEL_17M,
1468
+ MODEL_22M,
1469
+ MODEL_33M,
1470
+ MODEL_109M,
1471
+ MODEL_335M,
1472
  MODEL_0_5B,
1473
  MODEL_1B,
1474
  MODEL_2B,
 
1510
  uint32_t n_ff;
1511
  uint32_t n_expert = 0;
1512
  uint32_t n_expert_used = 0;
1513
+ uint32_t n_vocab_type = 0; // for BERT-style token types
1514
 
1515
  float f_norm_eps;
1516
  float f_norm_rms_eps;
 
1523
  float f_clamp_kqv;
1524
  float f_max_alibi_bias;
1525
 
1526
+ bool causal_attn = true;
1527
+
1528
 
1529
  bool operator!=(const llama_hparams & other) const {
1530
  if (this->vocab_only != other.vocab_only) return true;
 
1752
  llama_vocab vocab;
1753
 
1754
  struct ggml_tensor * tok_embd;
1755
+ struct ggml_tensor * type_embd;
1756
  struct ggml_tensor * pos_embd;
1757
  struct ggml_tensor * tok_norm;
1758
  struct ggml_tensor * tok_norm_b;
 
1872
  // memory buffers used to evaluate the model
1873
  std::vector<uint8_t> buf_compute_meta;
1874
  ggml_backend_sched_t sched = nullptr;
 
 
1875
 
1876
  // input tensors
1877
  ggml_backend_buffer_t buf_input = nullptr;
 
1881
  struct ggml_tensor * inp_pos; // I32 [n_batch]
1882
  struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
1883
  struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
1884
+ struct ggml_tensor * inp_sum; // F32 [1, n_batch]
1885
 
1886
  #ifdef GGML_USE_MPI
1887
  ggml_mpi_context * ctx_mpi = NULL;
 
2861
  switch (type) {
2862
  case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2863
  case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2864
+ case LLAMA_VOCAB_TYPE_WPM: return "WPM";
2865
  default: return "unknown";
2866
  }
2867
  }
 
3033
  default: model.type = e_model::MODEL_UNKNOWN;
3034
  }
3035
  } break;
3036
+ case LLM_ARCH_BERT:
3037
+ {
3038
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3039
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3040
+ ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
3041
+
3042
+ switch (hparams.n_layer) {
3043
+ case 3:
3044
+ model.type = e_model::MODEL_17M; break; // bge-micro
3045
+ case 6:
3046
+ model.type = e_model::MODEL_22M; break; // MiniLM-L6
3047
+ case 12:
3048
+ switch (hparams.n_embd) {
3049
+ case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small
3050
+ case 768: model.type = e_model::MODEL_109M; break; // bge-base
3051
+ } break;
3052
+ case 24:
3053
+ model.type = e_model::MODEL_335M; break; // bge-large
3054
+ }
3055
+ } break;
3056
  case LLM_ARCH_BLOOM:
3057
  {
3058
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
3257
  vocab.special_unk_id = -1;
3258
  vocab.special_sep_id = -1;
3259
  vocab.special_pad_id = -1;
3260
+ } else if (tokenizer_name == "bert") {
3261
+ vocab.type = LLAMA_VOCAB_TYPE_WPM;
3262
+
3263
+ // default special tokens
3264
+ vocab.special_bos_id = 101;
3265
+ vocab.special_eos_id = 102;
3266
+ vocab.special_unk_id = 100;
3267
+ vocab.special_sep_id = -1;
3268
+ vocab.special_pad_id = -1;
3269
+ vocab.add_space_prefix = false;
3270
  } else {
3271
  LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
3272
  LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
 
3295
  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
3296
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
3297
  vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
3298
+ } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
3299
+ vocab.linefeed_id = vocab.special_pad_id;
3300
  } else {
3301
  const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
3302
  GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
 
3634
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
3635
  const int64_t n_embd_gqa = n_embd_v_gqa;
3636
  const int64_t n_vocab = hparams.n_vocab;
3637
+ const int64_t n_vocab_type = hparams.n_vocab_type;
3638
  const int64_t n_ff = hparams.n_ff;
3639
 
3640
  GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
 
3849
  layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
3850
  }
3851
  } break;
3852
+ case LLM_ARCH_BERT:
3853
  {
3854
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3855
+ model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
3856
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
3857
+ model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
3858
+ model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
3859
+
3860
+ for (int i = 0; i < n_layer; ++i) {
3861
+ ggml_context * ctx_layer = ctx_for_layer(i);
3862
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3863
+
3864
+ auto & layer = model.layers[i];
3865
+
3866
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3867
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
3868
+
3869
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3870
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
3871
+
3872
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
3873
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
3874
+
3875
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
3876
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
3877
+
3878
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3879
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
3880
+
3881
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3882
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
3883
+
3884
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3885
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3886
+
3887
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3888
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
3889
+ }
3890
+ } break;
3891
+ case LLM_ARCH_BLOOM:
3892
+ {
3893
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3894
+ model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
3895
+ model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
3896
 
3897
  // output
3898
  {
 
4844
  const int32_t n_orig_ctx;
4845
 
4846
  const bool do_rope_shift;
4847
+ const bool causal_attn;
4848
 
4849
  const llm_build_cb & cb;
4850
 
 
4888
  kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
4889
  n_orig_ctx (cparams.n_yarn_orig_ctx),
4890
  do_rope_shift (worst_case || kv_self.has_shift),
4891
+ causal_attn (hparams.causal_attn),
4892
  cb (cb),
4893
  buf_compute_meta (lctx.buf_compute_meta) {
4894
  // all initializations should be done in init()
 
5732
  return gf;
5733
  }
5734
 
5735
+ struct ggml_cgraph * build_bert() {
5736
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5737
+
5738
+ const int64_t n_embd_head = hparams.n_embd_head_v;
5739
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5740
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
5741
+
5742
+ struct ggml_tensor * cur;
5743
+ struct ggml_tensor * inpL;
5744
+
5745
+ // get input vectors with right size
5746
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5747
+ struct ggml_tensor * inp_sum = ggml_view_1d(ctx0, lctx.inp_sum, n_tokens, 0);
5748
+
5749
+ // construct input embeddings (token, type, position)
5750
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5751
+ // token types are hardcoded to zero ("Sentence A")
5752
+ struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
5753
+ inpL = ggml_add(ctx0, inpL, type_row0);
5754
+ inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
5755
+ cb(inpL, "inp_embd", -1);
5756
+
5757
+ // embed layer norm
5758
+ inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
5759
+ cb(inpL, "inp_norm", -1);
5760
+
5761
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5762
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5763
+ cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
5764
+
5765
+ // iterate layers
5766
+ for (int il = 0; il < n_layer; ++il) {
5767
+ struct ggml_tensor * cur = inpL;
5768
+
5769
+ // self-attention
5770
+ {
5771
+ struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
5772
+ cb(Qcur, "Qcur", il);
5773
+
5774
+ struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
5775
+ cb(Kcur, "Kcur", il);
5776
+
5777
+ struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
5778
+ cb(Vcur, "Vcur", il);
5779
+
5780
+ // seems like we just need to do this for Q?
5781
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5782
+
5783
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5784
+ model.layers[il].wo, model.layers[il].bo,
5785
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5786
+ cb(cur, "kqv_out", il);
5787
+ }
5788
+
5789
+ // re-add the layer input
5790
+ cur = ggml_add(ctx0, cur, inpL);
5791
+
5792
+ // attention layer norm
5793
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il);
5794
+
5795
+ struct ggml_tensor * ffn_inp = cur;
5796
+ cb(ffn_inp, "ffn_inp", il);
5797
+
5798
+ // feed-forward network
5799
+ cur = llm_build_ffn(ctx0, cur,
5800
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
5801
+ NULL, NULL,
5802
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
5803
+ NULL,
5804
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
5805
+ cb(cur, "ffn_out", il);
5806
+
5807
+ // attentions bypass the intermediate layer
5808
+ cur = ggml_add(ctx0, cur, ffn_inp);
5809
+
5810
+ // output layer norm
5811
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il);
5812
+
5813
+ // input for next layer
5814
+ inpL = cur;
5815
+ }
5816
+
5817
+ // final output
5818
+ cur = inpL;
5819
+
5820
+ // pooling
5821
+ cur = ggml_mul_mat(ctx0, inp_sum, ggml_cont(ctx0, ggml_transpose(ctx0, cur)));
5822
+ cb(cur, "result_embed", -1);
5823
+
5824
+ ggml_build_forward_expand(gf, cur);
5825
+
5826
+ return gf;
5827
+ }
5828
+
5829
  struct ggml_cgraph * build_bloom() {
5830
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5831
 
 
7197
 
7198
  static struct ggml_cgraph * llama_build_graph(
7199
  llama_context & lctx,
7200
+ const llama_batch & batch,
7201
+ bool worst_case) {
7202
  const auto & model = lctx.model;
7203
 
 
 
 
7204
  // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
7205
  llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
7206
  if (il >= 0) {
 
7221
 
7222
  struct llm_build_context llm(lctx, batch, cb, worst_case);
7223
 
7224
  llm.init();
7225
 
7226
  switch (model.arch) {
 
7248
  {
7249
  result = llm.build_refact();
7250
  } break;
7251
+ case LLM_ARCH_BERT:
7252
+ {
7253
+ result = llm.build_bert();
7254
+ } break;
7255
  case LLM_ARCH_BLOOM:
7256
  {
7257
  result = llm.build_bloom();
 
7309
  return result;
7310
  }
7311
 
7312
+ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7313
+ //
7314
+ // set input data
7315
+ //
7316
+
7317
+ const auto & hparams = lctx.model.hparams;
7318
+ const auto & cparams = lctx.cparams;
7319
+ const auto & kv_self = lctx.kv_self;
7320
+
7321
+ if (batch.token) {
7322
+ const int64_t n_tokens = batch.n_tokens;
7323
+
7324
+ ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
7325
+ }
7326
+
7327
+ if (batch.embd) {
7328
+ const int64_t n_embd = hparams.n_embd;
7329
+ const int64_t n_tokens = batch.n_tokens;
7330
+
7331
+ ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
7332
+ }
7333
+
7334
+ if (batch.pos) {
7335
+ const int64_t n_tokens = batch.n_tokens;
7336
+
7337
+ ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
7338
+ }
7339
+
7340
+ {
7341
+ const int64_t n_kv = kv_self.n;
7342
+ const int64_t n_tokens = batch.n_tokens;
7343
+
7344
+ assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
7345
+
7346
+ float * data = (float *) lctx.inp_KQ_mask->data;
7347
+
7348
+ for (int h = 0; h < 1; ++h) {
7349
+ for (int j = 0; j < n_tokens; ++j) {
7350
+ const llama_pos pos = batch.pos[j];
7351
+ const llama_seq_id seq_id = batch.seq_id[j][0];
7352
+
7353
+ for (int i = 0; i < n_kv; ++i) {
7354
+ float f;
7355
+ if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
7356
+ f = -INFINITY;
7357
+ } else {
7358
+ f = 0;
7359
+ }
7360
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
7361
+ }
7362
+ }
7363
+ }
7364
+ }
7365
+
7366
+
7367
+ {
7368
+ assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
7369
+ float * data = (float *) lctx.inp_sum->data;
7370
+
7371
+ for (int i = 0; i < batch.n_tokens; ++i) {
7372
+ data[i] = 1.0f/float(batch.n_tokens);
7373
+ }
7374
+ }
7375
+
7376
+ if (kv_self.has_shift) {
7377
+ const int64_t n_ctx = cparams.n_ctx;
7378
+
7379
+ assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
7380
+
7381
+ int32_t * data = (int32_t *) lctx.inp_K_shift->data;
7382
+
7383
+ for (int i = 0; i < n_ctx; ++i) {
7384
+ data[i] = lctx.kv_self.cells[i].delta;
7385
+ }
7386
+ }
7387
+ }
7388
+
7389
  // decode a batch of tokens by evaluating the transformer
7390
  //
7391
  // - lctx: llama context
 
7484
  ggml_backend_sched_reset(lctx.sched);
7485
  ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
7486
 
7487
+ ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
7488
 
7489
  // the output is always the last tensor in the graph
7490
  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
 
 
 
7491
  struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
7492
+ if (strcmp(res->name, "result_output") == 0) {
7493
+ // the embeddings could be the second to last tensor, or the third to last tensor
7494
+ if (strcmp(embeddings->name, "result_norm") != 0) {
7495
+ embeddings = gf->nodes[gf->n_nodes - 3];
7496
+ GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
7497
+ }
7498
+ } else if (strcmp(res->name, "result_embed") == 0) {
7499
+ embeddings = res;
7500
+ res = nullptr;
7501
+ } else {
7502
+ GGML_ASSERT(false);
7503
  }
7504
 
7505
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
7529
  if (lctx.backend_cpu != nullptr) {
7530
  ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
7531
  }
7532
+
7533
+ llama_set_inputs(lctx, batch);
7534
+
7535
  ggml_backend_sched_graph_compute(lctx.sched, gf);
7536
 
7537
  // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
 
7571
  // extract logits
7572
  // TODO: do not compute and extract logits if only embeddings are needed
7573
  // need to update the graphs to skip "result_output"
7574
+ if (res) {
7575
  auto & logits_out = lctx.logits;
7576
 
7577
  #ifndef NDEBUG
 
7615
  if (!lctx.embedding.empty()) {
7616
  auto & embedding_out = lctx.embedding;
7617
 
7618
+ const int64_t embed_pos = res ? n_embd * (n_tokens-1) : 0;
7619
+
7620
  embedding_out.resize(n_embd);
7621
  ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
7622
+ ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embed_pos*sizeof(float), n_embd*sizeof(float));
7623
  ggml_backend_synchronize(embeddings_backend);
7624
  }
7625
 
 
7683
  GGML_ASSERT(false);
7684
  return unicode_to_bytes_bpe(token_data.text);
7685
  }
7686
+ case LLAMA_VOCAB_TYPE_WPM: {
7687
+ GGML_ASSERT(false);
7688
+ }
7689
  default:
7690
  GGML_ASSERT(false);
7691
  }
 
7698
  const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
7699
  return vocab.token_to_id.at(buf);
7700
  }
7701
+ case LLAMA_VOCAB_TYPE_WPM:
7702
  case LLAMA_VOCAB_TYPE_BPE: {
7703
  return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
7704
  }
 
8169
  llm_bigram_bpe::queue work_queue;
8170
  };
8171
 
8172
+ struct llm_tokenizer_wpm {
8173
+ llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
8174
+
8175
+ void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
8176
+ auto * token_map = &vocab.token_to_id;
8177
+
8178
+ // normalize and split by whitespace
8179
+ std::vector<std::string> words = preprocess(text);
8180
+
8181
+ // bos token prepended already
8182
+
8183
+ // find the longest tokens that form the words
8184
+ for (const std::string &word : words) {
8185
+ // skip empty words
8186
+ if (word.size() == 0) {
8187
+ continue;
8188
+ }
8189
+
8190
+ // prepend phantom space
8191
+ std::string word1 = "\xe2\x96\x81" + word;
8192
+ int n = word1.size();
8193
+
8194
+ // we're at the start of a new word
8195
+ int i = 0;
8196
+ bool match_any = false;
8197
+
8198
+ // move through character position in word
8199
+ while (i < n) {
8200
+ // loop through possible match length
8201
+ bool match = false;
8202
+ for (int j = n; j > i; j--) {
8203
+ auto it = token_map->find(word1.substr(i, j - i));
8204
+ if (it != token_map->end()) {
8205
+ output.push_back(it->second);
8206
+ match = true;
8207
+ match_any = true;
8208
+ i = j;
8209
+ break;
8210
+ }
8211
+ }
8212
+
8213
+ // must be an unknown character
8214
+ if (!match) {
8215
+ i++;
8216
+ }
8217
+ }
8218
+
8219
+ // we didn't find any matches for this word
8220
+ if (!match_any) {
8221
+ output.push_back(vocab.special_unk_id);
8222
+ }
8223
+ }
8224
+
8225
+ // append eos token
8226
+ output.push_back(vocab.special_eos_id);
8227
+ }
8228
+
8229
+ std::vector<std::string> preprocess(const std::string & text) {
8230
+ std::string ori_str = normalize(text);
8231
+ uint64_t ori_size = ori_str.size();
8232
+
8233
+ // single punct / single symbol / single digit
8234
+ // baseline: add whitespace on the left and right of punct and chinese characters
8235
+ std::vector<std::string> words;
8236
+ std::string new_str = "";
8237
+ uint64_t i = 0;
8238
+ while (i < ori_size) {
8239
+ int utf_char_len = utf8_len(ori_str[i]);
8240
+ if ((utf_char_len == 1) && ispunct(ori_str[i])) {
8241
+ new_str += " ";
8242
+ new_str += ori_str[i];
8243
+ new_str += " ";
8244
+ i += 1;
8245
+ }
8246
+ else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) {
8247
+ new_str += " ";
8248
+ new_str += ori_str.substr(i, 3);
8249
+ new_str += " ";
8250
+ i += 3;
8251
+ }
8252
+ else {
8253
+ new_str += ori_str[i];
8254
+ i += 1;
8255
+ }
8256
+ }
8257
+
8258
+ // split by whitespace
8259
+ uint64_t l = 0;
8260
+ uint64_t r = 0;
8261
+ while (r < new_str.size()) {
8262
+ // if is whitespace
8263
+ if (isspace(new_str[r])) {
8264
+ if (r > l) words.push_back(new_str.substr(l, (r - l)));
8265
+ l = r + 1;
8266
+ r = l;
8267
+ }
8268
+ else {
8269
+ r += 1;
8270
+ }
8271
+ }
8272
+ if (r > l) {
8273
+ words.push_back(new_str.substr(l, (r - l)));
8274
+ }
8275
+ return words;
8276
+ }
8277
+
8278
+ std::string normalize(const std::string & text) {
8279
+ // TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
8280
+ std::string text2 = strip_accents(text);
8281
+ for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
8282
+ char c = text2[i];
8283
+ if (c >= 'A' && c <= 'Z') {
8284
+ text2[i] = c - 'A' + 'a';
8285
+ }
8286
+ }
8287
+ return text2;
8288
+ }
8289
+
8290
+ bool is_chinese_char(const std::string & str) {
8291
+ int len = str.length();
8292
+ unsigned int codepoint = 0;
8293
+ int num_bytes = 0;
8294
+ int i = 0;
8295
+ unsigned char ch = static_cast<unsigned char>(str[i]);
8296
+ if (ch <= 0x7f) {
8297
+ codepoint = ch;
8298
+ num_bytes = 1;
8299
+ } else if ((ch >> 5) == 0x06) {
8300
+ codepoint = ch & 0x1f;
8301
+ num_bytes = 2;
8302
+ } else if ((ch >> 4) == 0x0e) {
8303
+ codepoint = ch & 0x0f;
8304
+ num_bytes = 3;
8305
+ } else if ((ch >> 3) == 0x1e) {
8306
+ codepoint = ch & 0x07;
8307
+ num_bytes = 4;
8308
+ }
8309
+ for (int j = 1; j < num_bytes; ++j) {
8310
+ if (i + j >= len) {
8311
+ return false; // incomplete UTF-8 character
8312
+ }
8313
+ unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
8314
+ if ((next_ch >> 6) != 0x02) {
8315
+ return false; // invalid trailing byte
8316
+ }
8317
+ codepoint = (codepoint << 6) | (next_ch & 0x3f);
8318
+ }
8319
+ if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
8320
+ (codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
8321
+ (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
8322
+ (codepoint >= 0x2A700 && codepoint <= 0x2B73F) ||
8323
+ (codepoint >= 0x2B740 && codepoint <= 0x2B81F) ||
8324
+ (codepoint >= 0x2B920 && codepoint <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
8325
+ (codepoint >= 0xF900 && codepoint <= 0xFAFF) ||
8326
+ (codepoint >= 0x2F800 && codepoint <= 0x2FA1F) ||
8327
+ (codepoint >= 0x3000 && codepoint <= 0x303F) ||
8328
+ (codepoint >= 0xFF00 && codepoint <= 0xFFEF)) {
8329
+ return true; // NOLINT
8330
+ }
8331
+ return false;
8332
+ }
8333
+
8334
+ std::string strip_accents(const std::string & input_string) {
8335
+ std::string resultString;
8336
+ std::map<std::string, char> accent_map = {
8337
+ {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
8338
+ {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
8339
+ {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
8340
+ {"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
8341
+ {"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
8342
+ {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
8343
+ {"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
8344
+ {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
8345
+ {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
8346
+ };
8347
+
8348
+ for (size_t i = 0; i < input_string.length();) {
8349
+ int len = utf8_len(input_string[i]);
8350
+ std::string curChar = input_string.substr(i, len);
8351
+ auto iter = accent_map.find(curChar);
8352
+ if (iter != accent_map.end()) {
8353
+ resultString += iter->second;
8354
+ } else {
8355
+ resultString += curChar;
8356
+ }
8357
+ i += len;
8358
+ }
8359
+
8360
+ return resultString;
8361
+ }
8362
+
8363
+ static size_t utf8_len(char src) {
8364
+ const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
8365
+ uint8_t highbits = static_cast<uint8_t>(src) >> 4;
8366
+ return lookup[highbits];
8367
+ }
8368
+
8369
+ const llama_vocab & vocab;
8370
+ };
8371
+
8372
+ typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
8373
  FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
8374
  FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
8375
  } FRAGMENT_BUFFER_VARIANT_TYPE;
8376
 
8377
+ struct fragment_buffer_variant {
8378
  fragment_buffer_variant(llama_vocab::id _token)
8379
  :
8380
  type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
 
8404
 
8405
  // #define PRETOKENIZERDEBUG
8406
 
8407
+ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
 
8408
  // for each special token
8409
  for (const auto & st: vocab.special_tokens_cache) {
8410
  const auto & special_token = st.first;
 
8522
  switch (vocab.type) {
8523
  case LLAMA_VOCAB_TYPE_SPM:
8524
  {
8525
+ for (const auto & fragment: fragment_buffer) {
8526
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
 
 
8527
  // without adding this leading whitespace, we do not get the same results as the original tokenizer
8528
 
8529
  // TODO: It's likely possible to get rid of this string copy entirely
 
8543
  llm_tokenizer_spm tokenizer(vocab);
8544
  llama_escape_whitespace(raw_text);
8545
  tokenizer.tokenize(raw_text, output);
8546
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
 
 
8547
  output.push_back(fragment.token);
8548
  }
8549
  }
8550
  } break;
8551
  case LLAMA_VOCAB_TYPE_BPE:
8552
  {
8553
+ for (const auto & fragment: fragment_buffer) {
8554
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
 
 
8555
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
8556
 
8557
  #ifdef PRETOKENIZERDEBUG
 
8559
  #endif
8560
  llm_tokenizer_bpe tokenizer(vocab);
8561
  tokenizer.tokenize(raw_text, output);
8562
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
8563
+ output.push_back(fragment.token);
8564
  }
8565
+ }
8566
+ } break;
8567
+ case LLAMA_VOCAB_TYPE_WPM:
8568
+ {
8569
+ for (const auto & fragment: fragment_buffer) {
8570
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
8571
+ auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
8572
+
8573
+ #ifdef PRETOKENIZERDEBUG
8574
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
8575
+ #endif
8576
+ llm_tokenizer_wpm tokenizer(vocab);
8577
+ tokenizer.tokenize(raw_text, output);
8578
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
8579
  output.push_back(fragment.token);
8580
  }
8581
  }
 
11239
  // graph inputs
11240
  {
11241
  ggml_init_params init_params = {
11242
+ /* .mem_size */ ggml_tensor_overhead()*7,
11243
  /* .mem_buffer */ nullptr,
11244
  /* .no_alloc */ true,
11245
  };
 
11250
  ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
11251
  ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
11252
  ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
11253
+ ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, 1, cparams.n_batch);
11254
 
11255
  ggml_set_name(ctx->inp_tokens, "inp_tokens");
11256
  ggml_set_name(ctx->inp_embd, "inp_embd");
11257
  ggml_set_name(ctx->inp_pos, "inp_pos");
11258
  ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
11259
  ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
11260
+ ggml_set_name(ctx->inp_sum, "inp_sum");
11261
 
11262
  ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
11263
 
 
11283
  ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
11284
 
11285
  ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
 
11286
 
11287
  // build worst-case graph
11288
  int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
11289
  int n_past = cparams.n_ctx - n_tokens;
11290
  llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
11291
+ ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
11292
 
11293
  // initialize scheduler with the worst-case graph
11294
+ if (!ggml_backend_sched_reserve(ctx->sched, gf)) {
11295
+ LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
11296
+ llama_free(ctx);
11297
+ return nullptr;
11298
+ }
11299
 
11300
+ for (size_t i = 0; i < ctx->backends.size(); i++) {
11301
+ ggml_backend_t backend = ctx->backends[i];
11302
+ ggml_backend_buffer_type_t buft = backend_buft[i];
11303
+ size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
11304
  LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
11305
+ ggml_backend_buft_name(buft),
11306
+ size / 1024.0 / 1024.0);
11307
  }
11308
 
11309
  // note: the number of splits during measure is higher than during inference due to the kv shift
 
12192
  int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
12193
  if (0 <= token && token < llama_n_vocab(model)) {
12194
  switch (llama_vocab_get_type(model->vocab)) {
12195
+ case LLAMA_VOCAB_TYPE_WPM:
12196
  case LLAMA_VOCAB_TYPE_SPM: {
12197
  // NOTE: we accept all unsupported token types,
12198
  // suppressing them like CONTROL tokens.
 
12316
  s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
12317
  s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
12318
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
12319
+ s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
12320
 
12321
  return s.c_str();
12322
  }
examples/talk-llama/llama.h CHANGED
@@ -61,6 +61,7 @@ extern "C" {
61
  enum llama_vocab_type {
62
  LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
63
  LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
 
64
  };
65
 
66
  enum llama_token_type {
 
61
  enum llama_vocab_type {
62
  LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
63
  LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
64
+ LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
65
  };
66
 
67
  enum llama_token_type {