ggerganov committed
Commit 75c5f9c · Parent: 2ed0a44

talk-llama : sync llama.cpp

examples/talk-llama/llama.cpp CHANGED
The diff for this file is too large to render. See raw diff
 
examples/talk-llama/llama.h CHANGED
@@ -118,6 +118,12 @@ extern "C" {
         LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
     };
 
+    enum llama_split_mode {
+        LLAMA_SPLIT_NONE  = 0, // single GPU
+        LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
+        LLAMA_SPLIT_ROW   = 2, // split rows across GPUs
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token
@@ -180,8 +186,16 @@ extern "C" {
 
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
-        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
-        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+
+        // main_gpu interpretation depends on split_mode:
+        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
+        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
+        // LLAMA_SPLIT_LAYER: ignored
+        int32_t main_gpu;
+
+        // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+        const float * tensor_split;
 
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
         // If the provided progress_callback returns true, model loading continues.
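
For context, a minimal sketch of how the new split fields might be set when loading a model. This assumes the llama.cpp API as of this sync (in particular, llama_backend_init still taking a NUMA flag and LLAMA_MAX_DEVICES still being a compile-time macro); the model path "model.gguf", the multi-GPU build, and the 60/40 split are illustrative placeholders, not part of the commit:

#include <stdio.h>
#include "llama.h"

int main(void) {
    // assumption: in this version llama_backend_init takes a NUMA flag
    llama_backend_init(false);

    struct llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;                // offload all layers
    mparams.split_mode   = LLAMA_SPLIT_LAYER; // split layers and KV across GPUs
    mparams.main_gpu     = 0;                 // ignored for LLAMA_SPLIT_LAYER

    // tensor_split must point to LLAMA_MAX_DEVICES floats;
    // assumption: a multi-GPU build where LLAMA_MAX_DEVICES >= 2
    static float split[LLAMA_MAX_DEVICES];
    split[0] = 0.6f; // ~60% of the layers on device 0
    split[1] = 0.4f; // ~40% of the layers on device 1
    mparams.tensor_split = split;

    // "model.gguf" is a placeholder path
    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}

With LLAMA_SPLIT_ROW instead, main_gpu would select the device that holds small tensors and intermediate results, per the header comments above; with LLAMA_SPLIT_NONE, it selects the single device for the entire model.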