ggerganov committed
Commit 14e824b · unverified · 1 Parent(s): 1701a5d

talk-llama : sync llama.cpp

Makefile CHANGED
@@ -410,8 +410,8 @@ lsp: examples/lsp/lsp.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
 talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
 
-talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk-llama $(CC_SDL) $(LDFLAGS)
+talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk-llama $(CC_SDL) $(LDFLAGS)
 
 #
 # Audio samples
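
The Makefile change adds examples/talk-llama/unicode.cpp to the talk-llama target's sources, since the synced llama.cpp now keeps its unicode handling in a separate translation unit. As a quick usage sketch (assuming SDL2 development headers are installed, which the talk examples require), the example can be rebuilt with:

    make talk-llama
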
examples/talk-llama/CMakeLists.txt CHANGED
@@ -1,7 +1,7 @@
 if (WHISPER_SDL2)
     # talk-llama
     set(TARGET talk-llama)
-    add_executable(${TARGET} talk-llama.cpp llama.cpp)
+    add_executable(${TARGET} talk-llama.cpp llama.cpp unicode.cpp)
     target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
 
     if (WHISPER_CLBLAST)
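
The CMake build gets the equivalent change: unicode.cpp is appended to the add_executable() call so both build systems compile the same sources. A hedged sketch of configuring and building just this target (the build directory name is arbitrary; WHISPER_SDL2 must be enabled for the target to exist):

    cmake -B build -DWHISPER_SDL2=ON
    cmake --build build --target talk-llama
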
examples/talk-llama/llama.cpp CHANGED
The diff for this file is too large to render.
 
examples/talk-llama/llama.h CHANGED
@@ -59,9 +59,10 @@ extern "C" {
     typedef int32_t llama_seq_id;
 
     enum llama_vocab_type {
-        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
+        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
+        LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
+        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
     };
 
     // note: these values should be synchronized with ggml_rope
@@ -234,7 +235,9 @@ extern "C" {
     struct llama_context_params {
         uint32_t seed;              // RNG seed, -1 for random
         uint32_t n_ctx;             // text context, 0 = from model
-        uint32_t n_batch;           // prompt processing maximum batch size
+        uint32_t n_batch;           // logical maximum batch size that can be submitted to llama_decode
+        uint32_t n_ubatch;          // physical maximum batch size
+        uint32_t n_seq_max;         // max number of sequences (i.e. distinct states for recurrent models)
         uint32_t n_threads;         // number of threads to use for generation
         uint32_t n_threads_batch;   // number of threads to use for batch processing
 
@@ -277,7 +280,7 @@ extern "C" {
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
+        bool pure;                   // quantize all tensors to the default type
         void * imatrix;              // pointer to importance matrix data
     } llama_model_quantize_params;
 
@@ -376,6 +379,8 @@ extern "C" {
 
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
 
     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
     LLAMA_API enum llama_rope_type  llama_rope_type (const struct llama_model * model);
@@ -454,7 +459,7 @@ extern "C" {
         // Maximum number of sequences that can exist in a cell. It's not an error
         // if there are more sequences in a cell than this value, however they will
         // not be visible in the view cells_sequences.
-        int32_t n_max_seq;
+        int32_t n_seq_max;
 
         // Number of tokens in the cache. For example, if there are two populated
         // cells, the first with 1 sequence id in it and the second with 2 sequence
@@ -474,12 +479,12 @@ extern "C" {
         // Information for an individual cell.
         struct llama_kv_cache_view_cell * cells;
 
-        // The sequences for each cell. There will be n_max_seq items per cell.
+        // The sequences for each cell. There will be n_seq_max items per cell.
         llama_seq_id * cells_sequences;
     };
 
     // Create an empty KV cache view. (use only for debugging purposes)
-    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
 
     // Free a KV cache view. (use only for debugging purposes)
     LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
@@ -502,7 +507,7 @@ extern "C" {
     // seq_id < 0 : match any sequence
     // p0 < 0     : [0, p1]
     // p1 < 0     : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_rm(
+    LLAMA_API bool llama_kv_cache_seq_rm(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,
@@ -641,9 +646,18 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
+    // Set whether to use causal attention or not
+    // If set to true, the model will only attend to the past tokens
+    LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
+
     // Set abort callback
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
 
+    // Wait until all computations are finished
+    // This is automatically done when using one of the functions below to obtain the computation results
+    // and is not necessary to call it explicitly in most cases
+    LLAMA_API void llama_synchronize(struct llama_context * ctx);
+
     // Token logits obtained from the last call to llama_decode()
     // The logits for the last token are stored in the last row
     // Logits for which llama_batch.logits[i] == 0 are undefined
@@ -702,7 +716,7 @@ extern "C" {
 
     /// @details Convert the provided text into tokens.
     /// @param tokens   The tokens pointer must be large enough to hold the resulting tokens.
-    /// @return Returns the number of tokens on success, no more than n_max_tokens
+    /// @return Returns the number of tokens on success, no more than n_tokens_max
    /// @return Returns a negative number on failure - the number of tokens that would have been returned
    /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
    ///                Does not insert a leading space.
@@ -711,7 +725,7 @@ extern "C" {
                      const char * text,
                          int32_t   text_len,
                     llama_token * tokens,
-                        int32_t   n_max_tokens,
+                        int32_t   n_tokens_max,
                             bool   add_bos,
                             bool   special);
 
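The header changes summarize what this sync pulls in from upstream llama.cpp: LLAMA_VOCAB_TYPE_NONE shifts the vocab enum values, llama_context_params splits the batch size into n_batch/n_ubatch and adds n_seq_max, llama_kv_cache_seq_rm now returns a bool, the KV cache view uses n_seq_max, llama_tokenize's buffer size parameter is renamed to n_tokens_max, and llama_set_causal_attn/llama_synchronize are new. Below is a minimal, hedged C++ sketch of how a caller might exercise these declarations; it relies on the usual model/context helpers from the same header (llama_model_default_params, llama_load_model_from_file, llama_new_context_with_model), which are not part of this diff, and "models/model.gguf" is a placeholder path.

#include "llama.h"

#include <cstdio>
#include <vector>

int main() {
    // Model/context setup (helpers assumed from llama.h; error handling kept minimal)
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("models/model.gguf", mparams); // placeholder path
    if (model == nullptr) return 1;

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx     = 2048;
    cparams.n_batch   = 512; // logical max batch submitted to llama_decode
    cparams.n_ubatch  = 512; // physical max batch
    cparams.n_seq_max = 1;   // single sequence

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == nullptr) return 1;

    // Tokenize using the renamed n_tokens_max parameter; a negative return means the
    // buffer was too small and |return| is the required token count
    const char * text = "Hello";
    std::vector<llama_token> tokens(cparams.n_batch);
    const int32_t n = llama_tokenize(llama_get_model(ctx), text, /*text_len*/ 5,
                                     tokens.data(), /*n_tokens_max*/ (int32_t) tokens.size(),
                                     /*add_bos*/ true, /*special*/ false);
    if (n < 0) {
        fprintf(stderr, "token buffer too small, need %d tokens\n", -n);
    }

    // New in this sync: toggle causal attention explicitly
    llama_set_causal_attn(ctx, true);

    // llama_kv_cache_seq_rm now reports whether the removal succeeded
    const bool removed = llama_kv_cache_seq_rm(ctx, /*seq_id*/ 0, /*p0*/ 0, /*p1*/ -1);
    (void) removed;

    // Debug-only KV cache view, sized by n_seq_max instead of n_max_seq
    llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_seq_max*/ 1);
    llama_kv_cache_view_free(&view);

    // New in this sync: explicit synchronization point before reading results
    llama_synchronize(ctx);

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
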
examples/talk-llama/unicode.cpp ADDED
The diff for this file is too large to render.
 
examples/talk-llama/unicode.h CHANGED
The diff for this file is too large to render.