talk-llama : sync llama.cpp
Files changed:
- Makefile +2 -2
- examples/talk-llama/CMakeLists.txt +1 -1
- examples/talk-llama/llama.cpp +0 -0
- examples/talk-llama/llama.h +25 -11
- examples/talk-llama/unicode.cpp +0 -0
- examples/talk-llama/unicode.h +0 -0
Makefile
CHANGED

@@ -410,8 +410,8 @@ lsp: examples/lsp/lsp.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
 talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
 
-talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk-llama $(CC_SDL) $(LDFLAGS)
+talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk-llama $(CC_SDL) $(LDFLAGS)
 
 #
 # Audio samples
examples/talk-llama/CMakeLists.txt
CHANGED

@@ -1,7 +1,7 @@
 if (WHISPER_SDL2)
     # talk-llama
     set(TARGET talk-llama)
-    add_executable(${TARGET} talk-llama.cpp llama.cpp)
+    add_executable(${TARGET} talk-llama.cpp llama.cpp unicode.cpp)
     target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
 
     if (WHISPER_CLBLAST)
examples/talk-llama/llama.cpp
CHANGED

The diff for this file is too large to render. See raw diff.
examples/talk-llama/llama.h
CHANGED

@@ -59,9 +59,10 @@ extern "C" {
     typedef int32_t llama_seq_id;
 
     enum llama_vocab_type {
-        LLAMA_VOCAB_TYPE_SPM  = 0, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE  = 1, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM  = 2, // WordPiece
+        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
+        LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
+        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
     };
 
     // note: these values should be synchronized with ggml_rope
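Because LLAMA_VOCAB_TYPE_NONE takes value 0, the SPM/BPE/WPM constants are all renumbered, so code that persisted raw enum values needs to re-check them. A minimal usage sketch, not part of the commit, assuming a llama_model loaded elsewhere:

#include "llama.h"
#include <cstdio>

// Sketch: report the tokenizer family of an already-loaded model.
void print_vocab_type(const llama_model * model) {
    switch (llama_vocab_type(model)) {
        case LLAMA_VOCAB_TYPE_NONE: printf("no vocab\n");           break;
        case LLAMA_VOCAB_TYPE_SPM:  printf("SentencePiece\n");      break;
        case LLAMA_VOCAB_TYPE_BPE:  printf("Byte Pair Encoding\n"); break;
        case LLAMA_VOCAB_TYPE_WPM:  printf("WordPiece\n");          break;
    }
}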
@@ -234,7 +235,9 @@ extern "C" {
     struct llama_context_params {
         uint32_t seed;              // RNG seed, -1 for random
         uint32_t n_ctx;             // text context, 0 = from model
-        uint32_t n_batch;           // prompt processing maximum batch size
+        uint32_t n_batch;           // logical maximum batch size that can be submitted to llama_decode
+        uint32_t n_ubatch;          // physical maximum batch size
+        uint32_t n_seq_max;         // max number of sequences (i.e. distinct states for recurrent models)
         uint32_t n_threads;         // number of threads to use for generation
         uint32_t n_threads_batch;   // number of threads to use for batch processing
 
@@ -277,7 +280,7 @@ extern "C" {
         bool allow_requantize;        // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor;  // quantize output.weight
         bool only_copy;               // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                    // disable k-quant mixtures and quantize all tensors to the same type
+        bool pure;                    // quantize all tensors to the default type
         void * imatrix;               // pointer to importance matrix data
     } llama_model_quantize_params;
 
@@ -376,6 +379,8 @@ extern "C" {
 
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
 
     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
     LLAMA_API enum llama_rope_type  llama_rope_type (const struct llama_model * model);
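Together with the new context-params fields above, these getters distinguish the logical batch size (how many tokens one llama_decode call may submit) from the physical one (how many the backend processes at a time). A minimal sketch, not part of the commit, assuming a llama_model loaded elsewhere; the specific sizes are arbitrary:

#include "llama.h"
#include <cstdio>

// Sketch: configure the new batch-size fields and read them back.
llama_context * make_context(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx     = 2048;
    cparams.n_batch   = 512; // logical: most tokens one llama_decode call may submit
    cparams.n_ubatch  = 128; // physical: most tokens processed internally at once
    cparams.n_seq_max = 4;   // distinct sequence states kept by this context

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx) {
        printf("n_batch = %u, n_ubatch = %u, n_seq_max = %u\n",
               llama_n_batch(ctx), llama_n_ubatch(ctx), llama_n_seq_max(ctx));
    }
    return ctx;
}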
@@ -454,7 +459,7 @@ extern "C" {
         // Maximum number of sequences that can exist in a cell. It's not an error
         // if there are more sequences in a cell than this value, however they will
         // not be visible in the view cells_sequences.
-        int32_t n_max_seq;
+        int32_t n_seq_max;
 
         // Number of tokens in the cache. For example, if there are two populated
         // cells, the first with 1 sequence id in it and the second with 2 sequence
@@ -474,12 +479,12 @@ extern "C" {
         // Information for an individual cell.
         struct llama_kv_cache_view_cell * cells;
 
-        // The sequences for each cell. There will be n_max_seq items per cell.
+        // The sequences for each cell. There will be n_seq_max items per cell.
         llama_seq_id * cells_sequences;
     };
 
     // Create an empty KV cache view. (use only for debugging purposes)
-    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
 
     // Free a KV cache view. (use only for debugging purposes)
     LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
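The change here is only the n_max_seq to n_seq_max rename. For orientation, a sketch of the debugging workflow this view is meant for, not part of the commit; it assumes a context that has already decoded tokens, and the used_cells/token_count fields and llama_kv_cache_view_update call reflect our reading of the upstream struct rather than anything shown in this diff:

#include "llama.h"
#include <cstdio>

// Sketch (debugging only): inspect KV cache occupancy through the view API.
void dump_kv_cache(llama_context * ctx) {
    // track at most 4 sequence ids per cell (the renamed n_seq_max argument)
    llama_kv_cache_view view = llama_kv_cache_view_init(ctx, 4);
    llama_kv_cache_view_update(ctx, &view); // refresh the snapshot
    printf("cells used: %d, tokens cached: %d\n", view.used_cells, view.token_count);
    llama_kv_cache_view_free(&view);
}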
@@ -502,7 +507,7 @@ extern "C" {
     // seq_id < 0 : match any sequence
     // p0 < 0     : [0, p1]
     // p1 < 0     : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_rm(
+    LLAMA_API bool llama_kv_cache_seq_rm(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,
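The return type changes from void to bool, so callers can now detect when a removal did not happen. The diff does not say when that occurs; the fallback below is illustrative only. A minimal sketch assuming a valid context:

#include "llama.h"

// Sketch: drop positions [16, inf) of sequence 0, checking the new bool result.
void trim_sequence(llama_context * ctx) {
    if (!llama_kv_cache_seq_rm(ctx, /*seq_id=*/0, /*p0=*/16, /*p1=*/-1)) {
        // removal can fail for cache types that cannot drop a partial range;
        // as a crude illustrative fallback, clear the whole cache
        llama_kv_cache_clear(ctx);
    }
}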
@@ -641,9 +646,18 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
+    // Set whether to use causal attention or not
+    // If set to true, the model will only attend to the past tokens
+    LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
+
     // Set abort callback
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
 
+    // Wait until all computations are finished
+    // This is automatically done when using one of the functions below to obtain the computation results
+    // and is not necessary to call it explicitly in most cases
+    LLAMA_API void llama_synchronize(struct llama_context * ctx);
+
     // Token logits obtained from the last call to llama_decode()
     // The logits for the last token are stored in the last row
     // Logits for which llama_batch.logits[i] == 0 are undefined
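Per the comment on llama_synchronize, decoding may now complete asynchronously, with the result accessors waiting implicitly; the explicit call matters only when you need a barrier without reading results. A sketch combining both new calls, not part of the commit, assuming a valid context and a prepared batch; using non-causal attention for an embedding-style pass is our example, not something this diff prescribes:

#include "llama.h"

// Sketch: toggle causal attention around a decode and synchronize explicitly.
void embed_pass(llama_context * ctx, llama_batch & batch) {
    llama_set_causal_attn(ctx, false);  // attend bidirectionally for this pass
    llama_decode(ctx, batch);           // may return before the compute finishes
    llama_synchronize(ctx);             // explicit wait (get_logits/get_embeddings also wait)
    llama_set_causal_attn(ctx, true);   // restore causal attention for generation
}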
@@ -702,7 +716,7 @@ extern "C" {
 
     /// @details Convert the provided text into tokens.
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
-    /// @return Returns the number of tokens on success, no more than n_max_tokens
+    /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
     /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
     ///                Does not insert a leading space.
@@ -711,7 +725,7 @@ extern "C" {
            const char * text,
                int32_t   text_len,
            llama_token * tokens,
-               int32_t   n_max_tokens,
+               int32_t   n_tokens_max,
                   bool   add_bos,
                   bool   special);
 
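Only the parameter is renamed (n_max_tokens to n_tokens_max); the contract is unchanged: a negative return is the negated token count the buffer would have needed. A minimal sketch of the resulting two-pass pattern, not part of the commit, assuming a loaded llama_model:

#include "llama.h"
#include <string>
#include <vector>

// Sketch: two-pass tokenization using the negative-return convention.
std::vector<llama_token> tokenize(const llama_model * model, const std::string & text) {
    std::vector<llama_token> tokens(text.size() + 2); // rough initial guess
    int32_t n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                               tokens.data(), (int32_t) tokens.size(),
                               /*add_bos=*/true, /*special=*/false);
    if (n < 0) {
        // buffer too small: -n is the number of tokens that would have been written
        tokens.resize(-n);
        n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                           tokens.data(), (int32_t) tokens.size(),
                           /*add_bos=*/true, /*special=*/false);
    }
    tokens.resize(n > 0 ? n : 0);
    return tokens;
}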
examples/talk-llama/unicode.cpp
ADDED

The diff for this file is too large to render. See raw diff.

examples/talk-llama/unicode.h
CHANGED

The diff for this file is too large to render. See raw diff.