ggerganov committed
Commit ad4065a · unverified · Parent: 1fe6df0

talk-llama : sync latest llama.cpp (close #922, close #954)
examples/talk-llama/llama-util.h CHANGED
@@ -14,6 +14,7 @@

 #include <string>
 #include <vector>
+#include <stdexcept>

 #ifdef __has_include
 #if __has_include(<unistd.h>)
@@ -74,7 +75,7 @@ struct llama_file {
     llama_file(const char * fname, const char * mode) {
         fp = std::fopen(fname, mode);
         if (fp == NULL) {
-            throw format("failed to open %s: %s", fname, std::strerror(errno));
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
         }
         seek(0, SEEK_END);
         size = tell();
@@ -100,17 +101,17 @@ struct llama_file {
         LLAMA_ASSERT(ret == 0); // same
     }

-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
+    void read_raw(void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
+        std::size_t ret = std::fread(ptr, len, 1, fp);
         if (ferror(fp)) {
-            throw format("read error: %s", strerror(errno));
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
         if (ret != 1) {
-            throw std::string("unexpectedly reached end of file");
+            throw std::runtime_error(std::string("unexpectedly reached end of file"));
         }
     }

@@ -126,14 +127,14 @@ struct llama_file {
         return std::string(chars.data(), len);
     }

-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
+    void write_raw(const void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
+        size_t ret = std::fwrite(ptr, len, 1, fp);
         if (ret != 1) {
-            throw format("write error: %s", strerror(errno));
+            throw std::runtime_error(format("write error: %s", strerror(errno)));
         }
     }

@@ -171,7 +172,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file, bool prefetch = true) {
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
@@ -180,12 +181,12 @@ struct llama_mmap {
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
-            throw format("mmap failed: %s", strerror(errno));
+            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }

-        if (prefetch) {
+        if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (madvise(addr, file->size, MADV_WILLNEED)) {
+            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
                 fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
@@ -207,7 +208,7 @@ struct llama_mmap {
         DWORD error = GetLastError();

         if (hMapping == NULL) {
-            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
         }

         addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
@@ -215,7 +216,7 @@ struct llama_mmap {
         CloseHandle(hMapping);

         if (addr == NULL) {
-            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
         }

 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
@@ -243,8 +244,9 @@ struct llama_mmap {
 #else
     static constexpr bool SUPPORTED = false;

-    llama_mmap(struct llama_file *) {
-        throw std::string("mmap not supported");
+    llama_mmap(struct llama_file *, bool prefetch = true) {
+        (void)prefetch;
+        throw std::runtime_error(std::string("mmap not supported"));
     }
 #endif
 };
@@ -265,9 +267,9 @@ struct llama_mlock {
         }
     }

-    void init(void * addr) {
-        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
-        this->addr = addr;
+    void init(void * ptr) {
+        LLAMA_ASSERT(addr == NULL && size == 0);
+        addr = ptr;
     }

     void grow_to(size_t target_size) {
@@ -338,14 +340,14 @@ struct llama_mlock {
         return (size_t) si.dwPageSize;
     }

-    bool raw_lock(void * addr, size_t size) {
+    bool raw_lock(void * ptr, size_t len) {
         for (int tries = 1; ; tries++) {
-            if (VirtualLock(addr, size)) {
+            if (VirtualLock(ptr, len)) {
                 return true;
             }
             if (tries == 2) {
                 fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                        size, this->size, llama_format_win_err(GetLastError()).c_str());
+                        len, size, llama_format_win_err(GetLastError()).c_str());
                 return false;
             }

@@ -361,7 +363,7 @@ struct llama_mlock {
             // is equal to the number of pages in its minimum working set minus
             // a small overhead."
             // Hopefully a megabyte is enough overhead:
-            size_t increment = size + 1048576;
+            size_t increment = len + 1048576;
             // The minimum must be <= the maximum, so we need to increase both:
             min_ws_size += increment;
             max_ws_size += increment;
@@ -373,8 +375,8 @@ struct llama_mlock {
         }
     }

-    void raw_unlock(void * addr, size_t size) {
-        if (!VirtualUnlock(addr, size)) {
+    void raw_unlock(void * ptr, size_t len) {
+        if (!VirtualUnlock(ptr, len)) {
             fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
         }
@@ -382,11 +384,16 @@ struct llama_mlock {
 #else
     static constexpr bool SUPPORTED = false;

-    void raw_lock(const void * addr, size_t size) {
+    size_t lock_granularity() {
+        return (size_t) 65536;
+    }
+
+    bool raw_lock(const void * addr, size_t len) {
         fprintf(stderr, "warning: mlock not supported on this system\n");
+        return false;
     }

-    void raw_unlock(const void * addr, size_t size) {}
+    void raw_unlock(const void * addr, size_t len) {}
 #endif
 };

@@ -395,36 +402,70 @@ struct llama_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;

-    void resize(size_t size) {
+    llama_buffer() = default;
+
+    void resize(size_t len) {
         delete[] addr;
-        addr = new uint8_t[size];
-        this->size = size;
+        addr = new uint8_t[len];
+        size = len;
     }

     ~llama_buffer() {
         delete[] addr;
     }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
 };

 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 struct llama_ctx_buffer {
     uint8_t * addr = NULL;
+    bool is_cuda;
     size_t size = 0;

+    llama_ctx_buffer() = default;
+
     void resize(size_t size) {
+        free();
+
+        addr = (uint8_t *) ggml_cuda_host_malloc(size);
         if (addr) {
-            ggml_cuda_host_free(addr);
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
         }
-        addr = (uint8_t *) ggml_cuda_host_malloc(size);
         this->size = size;
     }

-    ~llama_ctx_buffer() {
+    void free() {
         if (addr) {
-            ggml_cuda_host_free(addr);
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
         }
+        addr = NULL;
     }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
 };
 #else
 typedef llama_buffer llama_ctx_buffer;
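
The constructor change above replaces llama_mmap's bool prefetch flag with a byte count, so callers can bound how much of the mapping gets madvise(MADV_WILLNEED). A minimal sketch of the calling pattern on the POSIX path (illustration only, not part of the commit; assumes an already-opened llama_file):

    // Sketch: how call sites pick a prefetch size with the new signature.
    #include "llama-util.h"

    void mmap_examples(llama_file & file) {
        llama_mmap map_all (&file);                     // default (size_t) -1: advise the whole file
        llama_mmap map_none(&file, /* prefetch */ 0);   // skip madvise entirely (as the LoRA path does)
        llama_mmap map_some(&file, 16u * 1024 * 1024);  // advise only the first 16 MB
        (void) map_all; (void) map_none; (void) map_some;
    }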
examples/talk-llama/llama.cpp CHANGED
@@ -1,6 +1,7 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #endif
@@ -45,6 +46,7 @@ enum e_model {
     MODEL_65B,
 };

+
 static const size_t MB = 1024*1024;

 // computed for n_ctx == 2048
@@ -110,7 +112,7 @@ struct llama_hparams {
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

     bool operator!=(const llama_hparams & other) const {
-        return memcmp(this, &other, sizeof(llama_hparams));
+        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
     }
 };

@@ -406,6 +408,7 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
     LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+    LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
 };

 struct llama_file_loader {
@@ -424,24 +427,30 @@ struct llama_file_loader {
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
-        uint32_t version = 0;

-        if (magic != 'ggml') {
-            version = file.read_u32();
+        if (magic == LLAMA_FILE_MAGIC_GGML) {
+            file_version = LLAMA_FILE_VERSION_GGML;
+            return;
         }

-        if (magic == 'ggml' && version == 0) {
-            file_version = LLAMA_FILE_VERSION_GGML;
-        } else if (magic == 'ggmf' && version == 1) {
-            file_version = LLAMA_FILE_VERSION_GGMF_V1;
-        } else if (magic == 'ggjt' && version == 1) {
-            file_version = LLAMA_FILE_VERSION_GGJT_V1;
-        } else if (magic == 'ggjt' && version == 2) {
-            file_version = LLAMA_FILE_VERSION_GGJT_V2;
-        } else {
-            throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
-                         magic, version);
+        uint32_t version = file.read_u32();
+
+        switch (magic) {
+            case LLAMA_FILE_MAGIC_GGMF:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
+                }
+                break;
+            case LLAMA_FILE_MAGIC_GGJT:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
+                    case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
+                    case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
+                }
         }
+
+        throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+                     magic, version);
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
@@ -499,7 +508,7 @@ struct llama_file_loader {

             if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
                 // skip to the next multiple of 32 bytes
-                file.seek(-file.tell() & 31, SEEK_CUR);
+                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
             }
             shard.file_idx = file_idx;
             shard.file_off = file.tell();
@@ -574,7 +583,7 @@ struct llama_file_saver {
         file.write_u32(new_type);
         file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
         file.write_raw(tensor.name.data(), tensor.name.size());
-        file.seek(-file.tell() & 31, SEEK_CUR);
+        file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
         LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
         file.write_raw(new_data, new_size);
     }
@@ -641,7 +650,7 @@ struct llama_model_loader {
         }
     }

-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
             throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -652,10 +661,10 @@ struct llama_model_loader {
                     name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
         }

-        return get_tensor_for(lt);
+        return get_tensor_for(lt, backend);
     }

-    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -665,6 +674,7 @@ struct llama_model_loader {
         }
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+        tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
         return tensor;
@@ -678,12 +688,16 @@ struct llama_model_loader {

     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
+        size_t prefetch_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                prefetch_size += lt.size;
+            }
         }

         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
             if (!lmlock) {
                 // Don't call the callback since the actual loading will be lazy
                 // and we can't measure it.
@@ -696,6 +710,9 @@ struct llama_model_loader {

         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+                continue;
+            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
@@ -708,9 +725,6 @@ struct llama_model_loader {
                 lmlock->grow_to(done_size);
             }
         }
-        if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
-        }
     }

     void load_data_for(llama_load_tensor & lt) {
@@ -812,10 +826,9 @@ static bool kv_cache_init(
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx =*/ 512,
-        /*.n_parts =*/ -1,
         /*.gpu_layers =*/ 0,
         /*.seed =*/ -1,
-        /*.f16_kv =*/ false,
+        /*.f16_kv =*/ true,
         /*.logits_all =*/ false,
         /*.vocab_only =*/ false,
         /*.use_mmap =*/ true,
@@ -836,6 +849,21 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }

+void llama_init_backend() {
+    ggml_time_init();
+
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+}
+
+int64_t llama_time_us() {
+    return ggml_time_us();
+}
+
 //
 // model loading
 //
@@ -845,7 +873,8 @@ static const char *llama_file_version_name(llama_file_version version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
         case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
-        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
+        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
+        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
     }

     return "unknown";
@@ -925,11 +954,19 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }

-    if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
         if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+        }
+    }
+
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
+        if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
         }
     }

@@ -942,27 +979,7 @@ static void llama_model_load_internal(
     size_t ctx_size;
     size_t mmapped_size;
     ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
-
-    // print memory requirements
-    {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
-        // this is the total memory required to run the inference
-        const size_t mem_required =
-            ctx_size +
-            mmapped_size +
-            MEM_REQ_SCRATCH0().at(model.type) +
-            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
-
-        // this is the memory required by one llama_state
-        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
-
-        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-    }
+    fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);

     // create the ggml context
     {
@@ -984,7 +1001,14 @@ static void llama_model_load_internal(
         }
     }

+#ifdef GGML_USE_CUBLAS
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+#else
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#endif
+
     // prepare memory for the weights
+    size_t vram_total = 0;
     {
         const uint32_t n_embd = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -992,70 +1016,122 @@

         ml->ggml_ctx = ctx;

-        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-        model.norm = ml->get_tensor("norm.weight", {n_embd});
-        model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
+        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+        model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
+
+        // "output" tensor
+        {
+            ggml_backend backend_output;
+            if (n_gpu_layers > int(n_layer)) { // NOLINT
+                backend_output = LLAMA_BACKEND_OFFLOAD;
+            } else {
+                backend_output = GGML_BACKEND_CPU;
+            }
+
+            model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+        }
+
+        const int i_gpu_start = n_layer - n_gpu_layers;

         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
+            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+
             auto & layer = model.layers[i];

             std::string layers_i = "layers." + std::to_string(i);

-            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
+            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);

-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);

-            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
+            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
+
+            if (backend == GGML_BACKEND_CUDA) {
+                vram_total +=
+                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+            }
         }
     }

     ml->done_getting_tensors();

-    // populate `tensors_by_name`
-    for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
-    }
-
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
-
-    model.mapping = std::move(ml->mapping);
-#ifdef GGML_USE_CUBLAS
-    {
-        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
-        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
-
-        size_t vram_total = 0;
-
-        for (int i = 0; i < n_gpu; ++i) {
-            const auto & layer = model.layers[i];
-
-            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
-            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
-            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
-            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
-            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
-            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
-            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
-        }
-        if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
-            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
-        }
-
-        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-    }
-#else
-    (void) n_gpu_layers;
-#endif
+    // print memory requirements
+    {
+        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
+
+        // this is the total memory required to run the inference
+        const size_t mem_required =
+            ctx_size +
+            mmapped_size - vram_total + // weights in VRAM not in memory
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);
+
+        // this is the memory required by one llama_state
+        const size_t mem_required_state =
+            scale*MEM_REQ_KV_SELF().at(model.type);
+
+        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
+
+#ifdef GGML_USE_CUBLAS
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+        }
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+#else
+        (void) n_gpu_layers;
+#endif
+    }
+
+    // populate `tensors_by_name`
+    for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
+    }
+
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+
+#ifdef GGML_USE_CUBLAS
+    {
+        size_t done_size = 0;
+        size_t data_size = 0;
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                done_size += lt.size;
+            }
+        }
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+                continue;
+            }
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+            done_size += lt.size;
+        }
+    }
+#endif // GGML_USE_CUBLAS
+
+    if (progress_callback) {
+        progress_callback(1.0f, progress_callback_user_data);
+    }
+
+    model.mapping = std::move(ml->mapping);

     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
@@ -1154,10 +1230,8 @@ static bool llama_eval_internal(
         {
             cur = ggml_rms_norm(ctx0, inpL);

-            // cur = attention_norm*cur
-            cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
-                        cur);
+            // cur = cur*attention_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
         }

         // self-attention
@@ -1264,10 +1338,8 @@
         {
            cur = ggml_rms_norm(ctx0, inpFF);

-            // cur = ffn_norm*cur
-            cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
-                        cur);
+            // cur = cur*ffn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
         }

         struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@@ -1304,10 +1376,8 @@

         inpL = ggml_rms_norm(ctx0, inpL);

-        // inpL = norm*inpL
-        inpL = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.norm, inpL),
-                    inpL);
+        // inpL = inpL*norm(broadcasted)
+        inpL = ggml_mul(ctx0, inpL, model.norm);

         embeddings = inpL;
     }
@@ -2131,7 +2201,7 @@ struct llama_context * llama_init_from_file(
             unsigned * cur_percentage_p = (unsigned *) ctx;
             unsigned percentage = (unsigned) (100 * progress);
             while (percentage > *cur_percentage_p) {
-                ++*cur_percentage_p;
+                *cur_percentage_p = percentage;
                 fprintf(stderr, ".");
                 fflush(stderr);
                 if (percentage >= 100) {
@@ -2224,7 +2294,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
     {
         uint32_t magic;
         fin.read((char *) &magic, sizeof(magic));
-        if (magic != 'ggla') {
+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
             fprintf(stderr, "%s: bad file magic\n", __func__);
             return 1;
         }
@@ -2288,7 +2358,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
         }
     }

@@ -2381,7 +2451,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
            }
            size_t idx = model_loader->tensors_map.name_to_idx[base_name];
            llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-            base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+            base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
            lt.data = (uint8_t *) lt.ggml_tensor->data;
            model_loader->load_data_for(lt);
            lt.ggml_tensor->data = lt.data;
@@ -2607,8 +2677,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 }

 // Sets the state reading from the specified source address
-size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
-    const uint8_t * inp = src;
+size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
+    uint8_t * inp = src;

     // set rng
     {
examples/talk-llama/llama.h CHANGED
@@ -19,10 +19,16 @@
 # define LLAMA_API
 #endif

-#define LLAMA_FILE_VERSION 2
-#define LLAMA_FILE_MAGIC 'ggjt'
-#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
-#define LLAMA_SESSION_MAGIC 'ggsn'
+#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
+#define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
+#define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
+#define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
+#define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
+
+#define LLAMA_FILE_VERSION           3
+#define LLAMA_FILE_MAGIC             LLAMA_FILE_MAGIC_GGJT
+#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
+#define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1

 #ifdef __cplusplus
@@ -40,9 +46,9 @@ extern "C" {
     typedef int llama_token;

     typedef struct llama_token_data {
-        llama_token id; // token id
-        float logit; // log-odds of the token
-        float p; // probability of the token
+        llama_token id; // token id
+        float logit;    // log-odds of the token
+        float p;        // probability of the token
     } llama_token_data;

     typedef struct llama_token_data_array {
@@ -55,7 +61,6 @@ extern "C" {

     struct llama_context_params {
         int n_ctx; // text context
-        int n_parts; // -1 for default
         int n_gpu_layers; // number of layers to store in VRAM
         int seed; // RNG seed, -1 for random

@@ -74,16 +79,16 @@ extern "C" {

     // model file types
     enum llama_ftype {
-        LLAMA_FTYPE_ALL_F32 = 0,
-        LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
     };

     LLAMA_API struct llama_context_params llama_context_default_params();
@@ -91,6 +96,13 @@ extern "C" {
     LLAMA_API bool llama_mmap_supported();
     LLAMA_API bool llama_mlock_supported();

+    // TODO: not great API - very likely to change
+    // Initialize the llama + ggml backend
+    // Call once at the start of the program
+    LLAMA_API void llama_init_backend();
+
+    LLAMA_API int64_t llama_time_us();
+
     // Various functions for loading a ggml llama model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
@@ -139,7 +151,7 @@ extern "C" {

     // Set the state reading from the specified address
     // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
+    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);

     // Save/load session file
     LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
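
The new magic constants above are the same 32-bit values the old multi-character literals ('ggjt', 'ggml', ...) evaluate to on the compilers llama.cpp targets, just written out portably. A quick compile-time check (illustration only, not part of the commit):

    #include <cstdint>

    // Pack four ASCII characters big-endian, mirroring how 'ggjt'-style
    // literals are commonly evaluated; used only to sanity-check the constants.
    constexpr std::uint32_t fourcc(char a, char b, char c, char d) {
        return (std::uint32_t(std::uint8_t(a)) << 24) |
               (std::uint32_t(std::uint8_t(b)) << 16) |
               (std::uint32_t(std::uint8_t(c)) <<  8) |
                std::uint32_t(std::uint8_t(d));
    }

    static_assert(fourcc('g', 'g', 'j', 't') == 0x67676a74u, "ggjt");
    static_assert(fourcc('g', 'g', 'm', 'l') == 0x67676d6cu, "ggml");
    static_assert(fourcc('g', 'g', 's', 'n') == 0x6767736eu, "ggsn");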
examples/talk-llama/talk-llama.cpp CHANGED
@@ -33,8 +33,6 @@ struct whisper_params {
     int32_t max_tokens = 32;
     int32_t audio_ctx = 0;

-    int32_t n_parts_llama = -1;
-
     float vad_thold = 0.6f;
     float freq_thold = 100.0f;

@@ -72,7 +70,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
         else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
         else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
         else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
-        else if (arg == "--n-parts-llama") { params.n_parts_llama = std::stoi(argv[++i]); }
         else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
         else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
         else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
@@ -123,7 +120,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
     fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
     fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
-    fprintf(stderr, " --n-parts-llama N [%-7d] num parts in llama model file\n", params.n_parts_llama);
     fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
     fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", "");
     fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n");
@@ -239,13 +235,14 @@ int main(int argc, char ** argv) {

     // llama init

+    llama_init_backend();
+
     auto lparams = llama_context_default_params();

     // tune these to your liking
     lparams.n_ctx = 2048;
     lparams.seed = 1;
     lparams.f16_kv = true;
-    lparams.n_parts = params.n_parts_llama;

     struct llama_context * ctx_llama = llama_init_from_file(params.model_llama.c_str(), lparams);

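
Taken together, the talk-llama changes reduce to the following initialization sequence (a minimal sketch of the updated API usage; the model path and the error handling are placeholders, not from the commit):

    #include "llama.h"

    int main() {
        llama_init_backend();                     // new one-time llama/ggml init

        auto lparams   = llama_context_default_params();
        lparams.n_ctx  = 2048;
        lparams.seed   = 1;
        lparams.f16_kv = true;                    // now also the default
        // lparams.n_parts no longer exists

        struct llama_context * ctx_llama =
            llama_init_from_file("models/7B/ggml-model-q4_0.bin", lparams);   // placeholder path
        if (ctx_llama == NULL) {
            return 1;
        }

        // ... run the usual tokenize/eval loop here ...

        llama_free(ctx_llama);
        return 0;
    }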