talk-llama : sync llama.cpp

Files changed:
- examples/talk-llama/llama.cpp (+0 -0)
- examples/talk-llama/llama.h (+16 -2)
examples/talk-llama/llama.cpp
CHANGED
The diff for this file is too large to render. See raw diff.
examples/talk-llama/llama.h
CHANGED

@@ -118,6 +118,12 @@ extern "C" {
         LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
     };
 
+    enum llama_split_mode {
+        LLAMA_SPLIT_NONE  = 0, // single GPU
+        LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
+        LLAMA_SPLIT_ROW   = 2, // split rows across GPUs
+    };
+
     typedef struct llama_token_data {
         llama_token id;  // token id
         float logit;     // log-odds of the token

@@ -180,8 +186,16 @@ extern "C" {
 
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
-
-
+        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+
+        // main_gpu interpretation depends on split_mode:
+        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
+        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
+        // LLAMA_SPLIT_LAYER: ignored
+        int32_t main_gpu;
+
+        // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+        const float * tensor_split;
 
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
         // If the provided progress_callback returns true, model loading continues.
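For reference, the new multi-GPU fields added to llama_model_params above are set before loading a model through the llama.cpp C API that talk-llama builds against. The following is a minimal sketch, not part of this commit: the helper name, the model path argument, and the chosen values are illustrative assumptions, and backend setup and inference are omitted.

    #include <stddef.h>
    #include "llama.h"

    // Sketch: configure the new split-mode fields before loading a model.
    static struct llama_model * load_model_with_split(const char * model_path) {
        struct llama_model_params mparams = llama_model_default_params();

        mparams.n_gpu_layers = 99;                // offload all layers to VRAM (illustrative value)
        mparams.split_mode   = LLAMA_SPLIT_LAYER; // split layers and KV across GPUs
        mparams.main_gpu     = 0;                 // ignored when split_mode == LLAMA_SPLIT_LAYER

        // Left NULL here (the default); otherwise point it at an array of
        // LLAMA_MAX_DEVICES per-GPU proportions (layers or rows), per the header comment.
        mparams.tensor_split = NULL;

        return llama_load_model_from_file(model_path, mparams);
    }

With LLAMA_SPLIT_NONE, main_gpu selects the single device that runs the whole model; with LLAMA_SPLIT_ROW, it selects the device used for small tensors and intermediate results, as the header comments above describe.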