Spaces:
Running
Running
talk-llama : sync latest llama.cpp (close #922, close #954)
Browse files- examples/talk-llama/llama-util.h +77 -36
- examples/talk-llama/llama.cpp +174 -104
- examples/talk-llama/llama.h +30 -18
- examples/talk-llama/talk-llama.cpp +2 -5
examples/talk-llama/llama-util.h
CHANGED
|
@@ -14,6 +14,7 @@
|
|
| 14 |
|
| 15 |
#include <string>
|
| 16 |
#include <vector>
|
|
|
|
| 17 |
|
| 18 |
#ifdef __has_include
|
| 19 |
#if __has_include(<unistd.h>)
|
|
@@ -74,7 +75,7 @@ struct llama_file {
|
|
| 74 |
llama_file(const char * fname, const char * mode) {
|
| 75 |
fp = std::fopen(fname, mode);
|
| 76 |
if (fp == NULL) {
|
| 77 |
-
throw format("failed to open %s: %s", fname,
|
| 78 |
}
|
| 79 |
seek(0, SEEK_END);
|
| 80 |
size = tell();
|
|
@@ -100,17 +101,17 @@ struct llama_file {
|
|
| 100 |
LLAMA_ASSERT(ret == 0); // same
|
| 101 |
}
|
| 102 |
|
| 103 |
-
void read_raw(void * ptr, size_t
|
| 104 |
-
if (
|
| 105 |
return;
|
| 106 |
}
|
| 107 |
errno = 0;
|
| 108 |
-
std::size_t ret = std::fread(ptr,
|
| 109 |
if (ferror(fp)) {
|
| 110 |
-
throw format("read error: %s", strerror(errno));
|
| 111 |
}
|
| 112 |
if (ret != 1) {
|
| 113 |
-
throw std::string("unexpectedly reached end of file");
|
| 114 |
}
|
| 115 |
}
|
| 116 |
|
|
@@ -126,14 +127,14 @@ struct llama_file {
|
|
| 126 |
return std::string(chars.data(), len);
|
| 127 |
}
|
| 128 |
|
| 129 |
-
void write_raw(const void * ptr, size_t
|
| 130 |
-
if (
|
| 131 |
return;
|
| 132 |
}
|
| 133 |
errno = 0;
|
| 134 |
-
size_t ret = std::fwrite(ptr,
|
| 135 |
if (ret != 1) {
|
| 136 |
-
throw format("write error: %s", strerror(errno));
|
| 137 |
}
|
| 138 |
}
|
| 139 |
|
|
@@ -171,7 +172,7 @@ struct llama_mmap {
|
|
| 171 |
#ifdef _POSIX_MAPPED_FILES
|
| 172 |
static constexpr bool SUPPORTED = true;
|
| 173 |
|
| 174 |
-
llama_mmap(struct llama_file * file,
|
| 175 |
size = file->size;
|
| 176 |
int fd = fileno(file->fp);
|
| 177 |
int flags = MAP_SHARED;
|
|
@@ -180,12 +181,12 @@ struct llama_mmap {
|
|
| 180 |
#endif
|
| 181 |
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
| 182 |
if (addr == MAP_FAILED) {
|
| 183 |
-
throw format("mmap failed: %s", strerror(errno));
|
| 184 |
}
|
| 185 |
|
| 186 |
-
if (prefetch) {
|
| 187 |
// Advise the kernel to preload the mapped memory
|
| 188 |
-
if (madvise(addr, file->size, MADV_WILLNEED)) {
|
| 189 |
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
|
| 190 |
strerror(errno));
|
| 191 |
}
|
|
@@ -207,7 +208,7 @@ struct llama_mmap {
|
|
| 207 |
DWORD error = GetLastError();
|
| 208 |
|
| 209 |
if (hMapping == NULL) {
|
| 210 |
-
throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
|
| 211 |
}
|
| 212 |
|
| 213 |
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
|
|
@@ -215,7 +216,7 @@ struct llama_mmap {
|
|
| 215 |
CloseHandle(hMapping);
|
| 216 |
|
| 217 |
if (addr == NULL) {
|
| 218 |
-
throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
|
| 219 |
}
|
| 220 |
|
| 221 |
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
|
@@ -243,8 +244,9 @@ struct llama_mmap {
|
|
| 243 |
#else
|
| 244 |
static constexpr bool SUPPORTED = false;
|
| 245 |
|
| 246 |
-
llama_mmap(struct llama_file
|
| 247 |
-
|
|
|
|
| 248 |
}
|
| 249 |
#endif
|
| 250 |
};
|
|
@@ -265,9 +267,9 @@ struct llama_mlock {
|
|
| 265 |
}
|
| 266 |
}
|
| 267 |
|
| 268 |
-
void init(void *
|
| 269 |
-
LLAMA_ASSERT(
|
| 270 |
-
|
| 271 |
}
|
| 272 |
|
| 273 |
void grow_to(size_t target_size) {
|
|
@@ -338,14 +340,14 @@ struct llama_mlock {
|
|
| 338 |
return (size_t) si.dwPageSize;
|
| 339 |
}
|
| 340 |
|
| 341 |
-
bool raw_lock(void *
|
| 342 |
for (int tries = 1; ; tries++) {
|
| 343 |
-
if (VirtualLock(
|
| 344 |
return true;
|
| 345 |
}
|
| 346 |
if (tries == 2) {
|
| 347 |
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
|
| 348 |
-
|
| 349 |
return false;
|
| 350 |
}
|
| 351 |
|
|
@@ -361,7 +363,7 @@ struct llama_mlock {
|
|
| 361 |
// is equal to the number of pages in its minimum working set minus
|
| 362 |
// a small overhead."
|
| 363 |
// Hopefully a megabyte is enough overhead:
|
| 364 |
-
size_t increment =
|
| 365 |
// The minimum must be <= the maximum, so we need to increase both:
|
| 366 |
min_ws_size += increment;
|
| 367 |
max_ws_size += increment;
|
|
@@ -373,8 +375,8 @@ struct llama_mlock {
|
|
| 373 |
}
|
| 374 |
}
|
| 375 |
|
| 376 |
-
void raw_unlock(void *
|
| 377 |
-
if (!VirtualUnlock(
|
| 378 |
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
|
| 379 |
llama_format_win_err(GetLastError()).c_str());
|
| 380 |
}
|
|
@@ -382,11 +384,16 @@ struct llama_mlock {
|
|
| 382 |
#else
|
| 383 |
static constexpr bool SUPPORTED = false;
|
| 384 |
|
| 385 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 386 |
fprintf(stderr, "warning: mlock not supported on this system\n");
|
|
|
|
| 387 |
}
|
| 388 |
|
| 389 |
-
void raw_unlock(const void * addr, size_t
|
| 390 |
#endif
|
| 391 |
};
|
| 392 |
|
|
@@ -395,36 +402,70 @@ struct llama_buffer {
|
|
| 395 |
uint8_t * addr = NULL;
|
| 396 |
size_t size = 0;
|
| 397 |
|
| 398 |
-
|
|
|
|
|
|
|
| 399 |
delete[] addr;
|
| 400 |
-
addr = new uint8_t[
|
| 401 |
-
|
| 402 |
}
|
| 403 |
|
| 404 |
~llama_buffer() {
|
| 405 |
delete[] addr;
|
| 406 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
};
|
| 408 |
|
| 409 |
#ifdef GGML_USE_CUBLAS
|
| 410 |
#include "ggml-cuda.h"
|
| 411 |
struct llama_ctx_buffer {
|
| 412 |
uint8_t * addr = NULL;
|
|
|
|
| 413 |
size_t size = 0;
|
| 414 |
|
|
|
|
|
|
|
| 415 |
void resize(size_t size) {
|
|
|
|
|
|
|
|
|
|
| 416 |
if (addr) {
|
| 417 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 418 |
}
|
| 419 |
-
addr = (uint8_t *) ggml_cuda_host_malloc(size);
|
| 420 |
this->size = size;
|
| 421 |
}
|
| 422 |
|
| 423 |
-
|
| 424 |
if (addr) {
|
| 425 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 426 |
}
|
|
|
|
| 427 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
};
|
| 429 |
#else
|
| 430 |
typedef llama_buffer llama_ctx_buffer;
|
|
|
|
| 14 |
|
| 15 |
#include <string>
|
| 16 |
#include <vector>
|
| 17 |
+
#include <stdexcept>
|
| 18 |
|
| 19 |
#ifdef __has_include
|
| 20 |
#if __has_include(<unistd.h>)
|
|
|
|
| 75 |
llama_file(const char * fname, const char * mode) {
|
| 76 |
fp = std::fopen(fname, mode);
|
| 77 |
if (fp == NULL) {
|
| 78 |
+
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
| 79 |
}
|
| 80 |
seek(0, SEEK_END);
|
| 81 |
size = tell();
|
|
|
|
| 101 |
LLAMA_ASSERT(ret == 0); // same
|
| 102 |
}
|
| 103 |
|
| 104 |
+
void read_raw(void * ptr, size_t len) const {
|
| 105 |
+
if (len == 0) {
|
| 106 |
return;
|
| 107 |
}
|
| 108 |
errno = 0;
|
| 109 |
+
std::size_t ret = std::fread(ptr, len, 1, fp);
|
| 110 |
if (ferror(fp)) {
|
| 111 |
+
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
| 112 |
}
|
| 113 |
if (ret != 1) {
|
| 114 |
+
throw std::runtime_error(std::string("unexpectedly reached end of file"));
|
| 115 |
}
|
| 116 |
}
|
| 117 |
|
|
|
|
| 127 |
return std::string(chars.data(), len);
|
| 128 |
}
|
| 129 |
|
| 130 |
+
void write_raw(const void * ptr, size_t len) const {
|
| 131 |
+
if (len == 0) {
|
| 132 |
return;
|
| 133 |
}
|
| 134 |
errno = 0;
|
| 135 |
+
size_t ret = std::fwrite(ptr, len, 1, fp);
|
| 136 |
if (ret != 1) {
|
| 137 |
+
throw std::runtime_error(format("write error: %s", strerror(errno)));
|
| 138 |
}
|
| 139 |
}
|
| 140 |
|
|
|
|
| 172 |
#ifdef _POSIX_MAPPED_FILES
|
| 173 |
static constexpr bool SUPPORTED = true;
|
| 174 |
|
| 175 |
+
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
|
| 176 |
size = file->size;
|
| 177 |
int fd = fileno(file->fp);
|
| 178 |
int flags = MAP_SHARED;
|
|
|
|
| 181 |
#endif
|
| 182 |
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
| 183 |
if (addr == MAP_FAILED) {
|
| 184 |
+
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
|
| 185 |
}
|
| 186 |
|
| 187 |
+
if (prefetch > 0) {
|
| 188 |
// Advise the kernel to preload the mapped memory
|
| 189 |
+
if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
|
| 190 |
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
|
| 191 |
strerror(errno));
|
| 192 |
}
|
|
|
|
| 208 |
DWORD error = GetLastError();
|
| 209 |
|
| 210 |
if (hMapping == NULL) {
|
| 211 |
+
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
|
| 212 |
}
|
| 213 |
|
| 214 |
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
|
|
|
|
| 216 |
CloseHandle(hMapping);
|
| 217 |
|
| 218 |
if (addr == NULL) {
|
| 219 |
+
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
|
| 220 |
}
|
| 221 |
|
| 222 |
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
|
|
|
| 244 |
#else
|
| 245 |
static constexpr bool SUPPORTED = false;
|
| 246 |
|
| 247 |
+
// Constructor used in the branch where SUPPORTED is false: nothing is mapped
// and construction always fails with std::runtime_error.
llama_mmap(struct llama_file *, bool prefetch = true) {
    static_cast<void>(prefetch); // unused in this branch
    throw std::runtime_error("mmap not supported");
}
|
| 251 |
#endif
|
| 252 |
};
|
|
|
|
| 267 |
}
|
| 268 |
}
|
| 269 |
|
| 270 |
+
void init(void * ptr) {
|
| 271 |
+
LLAMA_ASSERT(addr == NULL && size == 0);
|
| 272 |
+
addr = ptr;
|
| 273 |
}
|
| 274 |
|
| 275 |
void grow_to(size_t target_size) {
|
|
|
|
| 340 |
return (size_t) si.dwPageSize;
|
| 341 |
}
|
| 342 |
|
| 343 |
+
bool raw_lock(void * ptr, size_t len) {
|
| 344 |
for (int tries = 1; ; tries++) {
|
| 345 |
+
if (VirtualLock(ptr, len)) {
|
| 346 |
return true;
|
| 347 |
}
|
| 348 |
if (tries == 2) {
|
| 349 |
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
|
| 350 |
+
len, size, llama_format_win_err(GetLastError()).c_str());
|
| 351 |
return false;
|
| 352 |
}
|
| 353 |
|
|
|
|
| 363 |
// is equal to the number of pages in its minimum working set minus
|
| 364 |
// a small overhead."
|
| 365 |
// Hopefully a megabyte is enough overhead:
|
| 366 |
+
size_t increment = len + 1048576;
|
| 367 |
// The minimum must be <= the maximum, so we need to increase both:
|
| 368 |
min_ws_size += increment;
|
| 369 |
max_ws_size += increment;
|
|
|
|
| 375 |
}
|
| 376 |
}
|
| 377 |
|
| 378 |
+
void raw_unlock(void * ptr, size_t len) {
|
| 379 |
+
if (!VirtualUnlock(ptr, len)) {
|
| 380 |
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
|
| 381 |
llama_format_win_err(GetLastError()).c_str());
|
| 382 |
}
|
|
|
|
| 384 |
#else
|
| 385 |
static constexpr bool SUPPORTED = false;
|
| 386 |
|
| 387 |
+
// Stub for the branch where SUPPORTED is false: reports a fixed granularity
// of 65536 bytes.
size_t lock_granularity() {
    return static_cast<size_t>(65536);
}
|
| 390 |
+
|
| 391 |
+
// Stub for the branch where SUPPORTED is false: prints a warning to stderr
// and reports failure. Both arguments are ignored.
bool raw_lock(const void * addr, size_t len) {
    (void) addr;
    (void) len;
    fputs("warning: mlock not supported on this system\n", stderr);
    return false;
}
|
| 395 |
|
| 396 |
+
// Stub for the branch where SUPPORTED is false: does nothing.
void raw_unlock(const void * addr, size_t len) {
    (void) addr;
    (void) len;
}
|
| 397 |
#endif
|
| 398 |
};
|
| 399 |
|
|
|
|
| 402 |
uint8_t * addr = NULL;
|
| 403 |
size_t size = 0;
|
| 404 |
|
| 405 |
+
llama_buffer() = default;
|
| 406 |
+
|
| 407 |
+
// Replaces the buffer with a freshly allocated array of `len` bytes.
// The previous contents are discarded, not copied.
// The old array is released BEFORE the new one is requested, so two arrays
// are never alive at once. If `new` throws, the object is left empty
// (addr == NULL, size == 0) and can still be destroyed safely.
void resize(size_t len) {
    delete[] addr;
    // Clear the members before allocating: if `new` throws, a stale `addr`
    // would be deleted a second time by the destructor.
    addr = NULL;
    size = 0;

    addr = new uint8_t[len];
    size = len;
}
|
| 412 |
|
| 413 |
// Releases the array referenced by `addr`; delete[] on a null pointer is a no-op.
~llama_buffer() {
|
| 414 |
delete[] addr;
|
| 415 |
}
|
| 416 |
+
|
| 417 |
+
// disable copy and move
|
| 418 |
+
llama_buffer(const llama_buffer&) = delete;
|
| 419 |
+
llama_buffer(llama_buffer&&) = delete;
|
| 420 |
+
llama_buffer& operator=(const llama_buffer&) = delete;
|
| 421 |
+
llama_buffer& operator=(llama_buffer&&) = delete;
|
| 422 |
};
|
| 423 |
|
| 424 |
#ifdef GGML_USE_CUBLAS
|
| 425 |
#include "ggml-cuda.h"
|
| 426 |
struct llama_ctx_buffer {
|
| 427 |
uint8_t * addr = NULL;
|
| 428 |
+
bool is_cuda;
|
| 429 |
size_t size = 0;
|
| 430 |
|
| 431 |
+
llama_ctx_buffer() = default;
|
| 432 |
+
|
| 433 |
void resize(size_t size) {
|
| 434 |
+
free();
|
| 435 |
+
|
| 436 |
+
addr = (uint8_t *) ggml_cuda_host_malloc(size);
|
| 437 |
if (addr) {
|
| 438 |
+
is_cuda = true;
|
| 439 |
+
}
|
| 440 |
+
else {
|
| 441 |
+
// fall back to pageable memory
|
| 442 |
+
addr = new uint8_t[size];
|
| 443 |
+
is_cuda = false;
|
| 444 |
}
|
|
|
|
| 445 |
this->size = size;
|
| 446 |
}
|
| 447 |
|
| 448 |
+
void free() {
|
| 449 |
if (addr) {
|
| 450 |
+
if (is_cuda) {
|
| 451 |
+
ggml_cuda_host_free(addr);
|
| 452 |
+
}
|
| 453 |
+
else {
|
| 454 |
+
delete[] addr;
|
| 455 |
+
}
|
| 456 |
}
|
| 457 |
+
addr = NULL;
|
| 458 |
}
|
| 459 |
+
|
| 460 |
+
~llama_ctx_buffer() {
|
| 461 |
+
free();
|
| 462 |
+
}
|
| 463 |
+
|
| 464 |
+
// disable copy and move
|
| 465 |
+
llama_ctx_buffer(const llama_ctx_buffer&) = delete;
|
| 466 |
+
llama_ctx_buffer(llama_ctx_buffer&&) = delete;
|
| 467 |
+
llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
|
| 468 |
+
llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
|
| 469 |
};
|
| 470 |
#else
|
| 471 |
typedef llama_buffer llama_ctx_buffer;
|
examples/talk-llama/llama.cpp
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
// Defines fileno on msys:
|
| 2 |
#ifndef _GNU_SOURCE
|
| 3 |
#define _GNU_SOURCE
|
|
|
|
| 4 |
#include <cstdint>
|
| 5 |
#include <cstdio>
|
| 6 |
#endif
|
|
@@ -45,6 +46,7 @@ enum e_model {
|
|
| 45 |
MODEL_65B,
|
| 46 |
};
|
| 47 |
|
|
|
|
| 48 |
static const size_t MB = 1024*1024;
|
| 49 |
|
| 50 |
// computed for n_ctx == 2048
|
|
@@ -110,7 +112,7 @@ struct llama_hparams {
|
|
| 110 |
enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
|
| 111 |
|
| 112 |
bool operator!=(const llama_hparams & other) const {
|
| 113 |
-
return memcmp(this, &other, sizeof(llama_hparams));
|
| 114 |
}
|
| 115 |
};
|
| 116 |
|
|
@@ -406,6 +408,7 @@ enum llama_file_version {
|
|
| 406 |
LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
|
| 407 |
LLAMA_FILE_VERSION_GGJT_V1, // added padding
|
| 408 |
LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
|
|
|
|
| 409 |
};
|
| 410 |
|
| 411 |
struct llama_file_loader {
|
|
@@ -424,24 +427,30 @@ struct llama_file_loader {
|
|
| 424 |
}
|
| 425 |
void read_magic() {
|
| 426 |
uint32_t magic = file.read_u32();
|
| 427 |
-
uint32_t version = 0;
|
| 428 |
|
| 429 |
-
if (magic
|
| 430 |
-
|
|
|
|
| 431 |
}
|
| 432 |
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
|
|
|
|
|
|
|
|
|
| 444 |
}
|
|
|
|
|
|
|
|
|
|
| 445 |
}
|
| 446 |
void read_hparams() {
|
| 447 |
hparams.n_vocab = file.read_u32();
|
|
@@ -499,7 +508,7 @@ struct llama_file_loader {
|
|
| 499 |
|
| 500 |
if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
|
| 501 |
// skip to the next multiple of 32 bytes
|
| 502 |
-
file.seek(-file.tell() & 31, SEEK_CUR);
|
| 503 |
}
|
| 504 |
shard.file_idx = file_idx;
|
| 505 |
shard.file_off = file.tell();
|
|
@@ -574,7 +583,7 @@ struct llama_file_saver {
|
|
| 574 |
file.write_u32(new_type);
|
| 575 |
file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
|
| 576 |
file.write_raw(tensor.name.data(), tensor.name.size());
|
| 577 |
-
file.seek(-file.tell() & 31, SEEK_CUR);
|
| 578 |
LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
|
| 579 |
file.write_raw(new_data, new_size);
|
| 580 |
}
|
|
@@ -641,7 +650,7 @@ struct llama_model_loader {
|
|
| 641 |
}
|
| 642 |
}
|
| 643 |
|
| 644 |
-
struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
|
| 645 |
auto it = tensors_map.name_to_idx.find(name);
|
| 646 |
if (it == tensors_map.name_to_idx.end()) {
|
| 647 |
throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
|
|
@@ -652,10 +661,10 @@ struct llama_model_loader {
|
|
| 652 |
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
|
| 653 |
}
|
| 654 |
|
| 655 |
-
return get_tensor_for(lt);
|
| 656 |
}
|
| 657 |
|
| 658 |
-
struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
|
| 659 |
struct ggml_tensor * tensor;
|
| 660 |
if (lt.ne.size() == 2) {
|
| 661 |
tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
|
|
@@ -665,6 +674,7 @@ struct llama_model_loader {
|
|
| 665 |
}
|
| 666 |
ggml_set_name(tensor, lt.name.c_str());
|
| 667 |
LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
|
|
|
|
| 668 |
lt.ggml_tensor = tensor;
|
| 669 |
num_ggml_tensors_created++;
|
| 670 |
return tensor;
|
|
@@ -678,12 +688,16 @@ struct llama_model_loader {
|
|
| 678 |
|
| 679 |
void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
|
| 680 |
size_t data_size = 0;
|
|
|
|
| 681 |
for (const llama_load_tensor & lt : tensors_map.tensors) {
|
| 682 |
data_size += lt.size;
|
|
|
|
|
|
|
|
|
|
| 683 |
}
|
| 684 |
|
| 685 |
if (use_mmap) {
|
| 686 |
-
mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
|
| 687 |
if (!lmlock) {
|
| 688 |
// Don't call the callback since the actual loading will be lazy
|
| 689 |
// and we can't measure it.
|
|
@@ -696,6 +710,9 @@ struct llama_model_loader {
|
|
| 696 |
|
| 697 |
size_t done_size = 0;
|
| 698 |
for (llama_load_tensor & lt : tensors_map.tensors) {
|
|
|
|
|
|
|
|
|
|
| 699 |
if (progress_callback) {
|
| 700 |
progress_callback((float) done_size / data_size, progress_callback_user_data);
|
| 701 |
}
|
|
@@ -708,9 +725,6 @@ struct llama_model_loader {
|
|
| 708 |
lmlock->grow_to(done_size);
|
| 709 |
}
|
| 710 |
}
|
| 711 |
-
if (progress_callback) {
|
| 712 |
-
progress_callback(1.0f, progress_callback_user_data);
|
| 713 |
-
}
|
| 714 |
}
|
| 715 |
|
| 716 |
void load_data_for(llama_load_tensor & lt) {
|
|
@@ -812,10 +826,9 @@ static bool kv_cache_init(
|
|
| 812 |
struct llama_context_params llama_context_default_params() {
|
| 813 |
struct llama_context_params result = {
|
| 814 |
/*.n_ctx =*/ 512,
|
| 815 |
-
/*.n_parts =*/ -1,
|
| 816 |
/*.gpu_layers =*/ 0,
|
| 817 |
/*.seed =*/ -1,
|
| 818 |
-
/*.f16_kv =*/
|
| 819 |
/*.logits_all =*/ false,
|
| 820 |
/*.vocab_only =*/ false,
|
| 821 |
/*.use_mmap =*/ true,
|
|
@@ -836,6 +849,21 @@ bool llama_mlock_supported() {
|
|
| 836 |
return llama_mlock::SUPPORTED;
|
| 837 |
}
|
| 838 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 839 |
//
|
| 840 |
// model loading
|
| 841 |
//
|
|
@@ -845,7 +873,8 @@ static const char *llama_file_version_name(llama_file_version version) {
|
|
| 845 |
case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
|
| 846 |
case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
|
| 847 |
case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
|
| 848 |
-
case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (
|
|
|
|
| 849 |
}
|
| 850 |
|
| 851 |
return "unknown";
|
|
@@ -925,11 +954,19 @@ static void llama_model_load_internal(
|
|
| 925 |
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
| 926 |
}
|
| 927 |
|
| 928 |
-
if (file_version
|
| 929 |
if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
|
| 930 |
hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
|
| 931 |
hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
|
| 932 |
-
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 933 |
}
|
| 934 |
}
|
| 935 |
|
|
@@ -942,27 +979,7 @@ static void llama_model_load_internal(
|
|
| 942 |
size_t ctx_size;
|
| 943 |
size_t mmapped_size;
|
| 944 |
ml->calc_sizes(&ctx_size, &mmapped_size);
|
| 945 |
-
fprintf(stderr, "%s: ggml ctx size = %
|
| 946 |
-
|
| 947 |
-
// print memory requirements
|
| 948 |
-
{
|
| 949 |
-
const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
|
| 950 |
-
|
| 951 |
-
// this is the total memory required to run the inference
|
| 952 |
-
const size_t mem_required =
|
| 953 |
-
ctx_size +
|
| 954 |
-
mmapped_size +
|
| 955 |
-
MEM_REQ_SCRATCH0().at(model.type) +
|
| 956 |
-
MEM_REQ_SCRATCH1().at(model.type) +
|
| 957 |
-
MEM_REQ_EVAL().at(model.type);
|
| 958 |
-
|
| 959 |
-
// this is the memory required by one llama_state
|
| 960 |
-
const size_t mem_required_state =
|
| 961 |
-
scale*MEM_REQ_KV_SELF().at(model.type);
|
| 962 |
-
|
| 963 |
-
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
| 964 |
-
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
| 965 |
-
}
|
| 966 |
|
| 967 |
// create the ggml context
|
| 968 |
{
|
|
@@ -984,7 +1001,14 @@ static void llama_model_load_internal(
|
|
| 984 |
}
|
| 985 |
}
|
| 986 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 987 |
// prepare memory for the weights
|
|
|
|
| 988 |
{
|
| 989 |
const uint32_t n_embd = hparams.n_embd;
|
| 990 |
const uint32_t n_layer = hparams.n_layer;
|
|
@@ -992,70 +1016,122 @@ static void llama_model_load_internal(
|
|
| 992 |
|
| 993 |
ml->ggml_ctx = ctx;
|
| 994 |
|
| 995 |
-
model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
|
| 996 |
-
model.norm = ml->get_tensor("norm.weight", {n_embd});
|
| 997 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 998 |
|
| 999 |
model.layers.resize(n_layer);
|
| 1000 |
for (uint32_t i = 0; i < n_layer; ++i) {
|
|
|
|
|
|
|
| 1001 |
auto & layer = model.layers[i];
|
| 1002 |
|
| 1003 |
std::string layers_i = "layers." + std::to_string(i);
|
| 1004 |
|
| 1005 |
-
layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1006 |
|
| 1007 |
-
layer.
|
| 1008 |
-
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
|
| 1009 |
-
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
|
| 1010 |
-
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
|
| 1011 |
|
| 1012 |
-
layer.
|
|
|
|
|
|
|
| 1013 |
|
| 1014 |
-
|
| 1015 |
-
|
| 1016 |
-
|
|
|
|
|
|
|
|
|
|
| 1017 |
}
|
| 1018 |
}
|
| 1019 |
|
| 1020 |
ml->done_getting_tensors();
|
| 1021 |
|
| 1022 |
-
//
|
| 1023 |
-
|
| 1024 |
-
|
| 1025 |
-
}
|
| 1026 |
|
| 1027 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1028 |
|
| 1029 |
-
model.mapping = std::move(ml->mapping);
|
| 1030 |
#ifdef GGML_USE_CUBLAS
|
| 1031 |
-
{
|
| 1032 |
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
| 1033 |
|
| 1034 |
fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1035 |
|
| 1036 |
-
|
|
|
|
|
|
|
|
|
|
| 1037 |
|
| 1038 |
-
|
| 1039 |
-
const auto & layer = model.layers[i];
|
| 1040 |
|
| 1041 |
-
|
| 1042 |
-
|
| 1043 |
-
|
| 1044 |
-
|
| 1045 |
-
|
| 1046 |
-
|
| 1047 |
-
|
|
|
|
|
|
|
| 1048 |
}
|
| 1049 |
-
|
| 1050 |
-
|
| 1051 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1052 |
}
|
|
|
|
|
|
|
| 1053 |
|
| 1054 |
-
|
|
|
|
| 1055 |
}
|
| 1056 |
-
|
| 1057 |
-
(
|
| 1058 |
-
#endif
|
| 1059 |
|
| 1060 |
// loading time will be recalculated after the first eval, so
|
| 1061 |
// we take page faults deferred by mmap() into consideration
|
|
@@ -1154,10 +1230,8 @@ static bool llama_eval_internal(
|
|
| 1154 |
{
|
| 1155 |
cur = ggml_rms_norm(ctx0, inpL);
|
| 1156 |
|
| 1157 |
-
// cur = attention_norm
|
| 1158 |
-
cur = ggml_mul(ctx0,
|
| 1159 |
-
ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
|
| 1160 |
-
cur);
|
| 1161 |
}
|
| 1162 |
|
| 1163 |
// self-attention
|
|
@@ -1264,10 +1338,8 @@ static bool llama_eval_internal(
|
|
| 1264 |
{
|
| 1265 |
cur = ggml_rms_norm(ctx0, inpFF);
|
| 1266 |
|
| 1267 |
-
// cur = ffn_norm
|
| 1268 |
-
cur = ggml_mul(ctx0,
|
| 1269 |
-
ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
|
| 1270 |
-
cur);
|
| 1271 |
}
|
| 1272 |
|
| 1273 |
struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
|
|
@@ -1304,10 +1376,8 @@ static bool llama_eval_internal(
|
|
| 1304 |
|
| 1305 |
inpL = ggml_rms_norm(ctx0, inpL);
|
| 1306 |
|
| 1307 |
-
// inpL = norm
|
| 1308 |
-
inpL = ggml_mul(ctx0,
|
| 1309 |
-
ggml_repeat(ctx0, model.norm, inpL),
|
| 1310 |
-
inpL);
|
| 1311 |
|
| 1312 |
embeddings = inpL;
|
| 1313 |
}
|
|
@@ -2131,7 +2201,7 @@ struct llama_context * llama_init_from_file(
|
|
| 2131 |
unsigned * cur_percentage_p = (unsigned *) ctx;
|
| 2132 |
unsigned percentage = (unsigned) (100 * progress);
|
| 2133 |
while (percentage > *cur_percentage_p) {
|
| 2134 |
-
|
| 2135 |
fprintf(stderr, ".");
|
| 2136 |
fflush(stderr);
|
| 2137 |
if (percentage >= 100) {
|
|
@@ -2224,7 +2294,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
|
| 2224 |
{
|
| 2225 |
uint32_t magic;
|
| 2226 |
fin.read((char *) &magic, sizeof(magic));
|
| 2227 |
-
if (magic !=
|
| 2228 |
fprintf(stderr, "%s: bad file magic\n", __func__);
|
| 2229 |
return 1;
|
| 2230 |
}
|
|
@@ -2288,7 +2358,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
|
| 2288 |
|
| 2289 |
// maybe this should be in llama_model_loader
|
| 2290 |
if (model_loader->use_mmap) {
|
| 2291 |
-
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */
|
| 2292 |
}
|
| 2293 |
}
|
| 2294 |
|
|
@@ -2381,7 +2451,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
|
| 2381 |
}
|
| 2382 |
size_t idx = model_loader->tensors_map.name_to_idx[base_name];
|
| 2383 |
llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
|
| 2384 |
-
base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
|
| 2385 |
lt.data = (uint8_t *) lt.ggml_tensor->data;
|
| 2386 |
model_loader->load_data_for(lt);
|
| 2387 |
lt.ggml_tensor->data = lt.data;
|
|
@@ -2607,8 +2677,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|
| 2607 |
}
|
| 2608 |
|
| 2609 |
// Sets the state reading from the specified source address
|
| 2610 |
-
size_t llama_set_state_data(struct llama_context * ctx,
|
| 2611 |
-
|
| 2612 |
|
| 2613 |
// set rng
|
| 2614 |
{
|
|
|
|
| 1 |
// Defines fileno on msys:
|
| 2 |
#ifndef _GNU_SOURCE
|
| 3 |
#define _GNU_SOURCE
|
| 4 |
+
#include <cstddef>
|
| 5 |
#include <cstdint>
|
| 6 |
#include <cstdio>
|
| 7 |
#endif
|
|
|
|
| 46 |
MODEL_65B,
|
| 47 |
};
|
| 48 |
|
| 49 |
+
|
| 50 |
static const size_t MB = 1024*1024;
|
| 51 |
|
| 52 |
// computed for n_ctx == 2048
|
|
|
|
| 112 |
enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
|
| 113 |
|
| 114 |
// Two parameter sets differ when a byte-wise comparison of the structs
// finds any mismatch.
bool operator!=(const llama_hparams & other) const {
    const int cmp = memcmp(this, &other, sizeof(llama_hparams));
    return cmp != 0;
}
|
| 117 |
};
|
| 118 |
|
|
|
|
| 408 |
LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
|
| 409 |
LLAMA_FILE_VERSION_GGJT_V1, // added padding
|
| 410 |
LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
|
| 411 |
+
LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
|
| 412 |
};
|
| 413 |
|
| 414 |
struct llama_file_loader {
|
|
|
|
| 427 |
}
|
| 428 |
void read_magic() {
|
| 429 |
uint32_t magic = file.read_u32();
|
|
|
|
| 430 |
|
| 431 |
+
if (magic == LLAMA_FILE_MAGIC_GGML) {
|
| 432 |
+
file_version = LLAMA_FILE_VERSION_GGML;
|
| 433 |
+
return;
|
| 434 |
}
|
| 435 |
|
| 436 |
+
uint32_t version = file.read_u32();
|
| 437 |
+
|
| 438 |
+
switch (magic) {
|
| 439 |
+
case LLAMA_FILE_MAGIC_GGMF:
|
| 440 |
+
switch (version) {
|
| 441 |
+
case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
|
| 442 |
+
}
|
| 443 |
+
break;
|
| 444 |
+
case LLAMA_FILE_MAGIC_GGJT:
|
| 445 |
+
switch (version) {
|
| 446 |
+
case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
|
| 447 |
+
case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
|
| 448 |
+
case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
|
| 449 |
+
}
|
| 450 |
}
|
| 451 |
+
|
| 452 |
+
throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
|
| 453 |
+
magic, version);
|
| 454 |
}
|
| 455 |
void read_hparams() {
|
| 456 |
hparams.n_vocab = file.read_u32();
|
|
|
|
| 508 |
|
| 509 |
if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
|
| 510 |
// skip to the next multiple of 32 bytes
|
| 511 |
+
file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
|
| 512 |
}
|
| 513 |
shard.file_idx = file_idx;
|
| 514 |
shard.file_off = file.tell();
|
|
|
|
| 583 |
file.write_u32(new_type);
|
| 584 |
file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
|
| 585 |
file.write_raw(tensor.name.data(), tensor.name.size());
|
| 586 |
+
file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
|
| 587 |
LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
|
| 588 |
file.write_raw(new_data, new_size);
|
| 589 |
}
|
|
|
|
| 650 |
}
|
| 651 |
}
|
| 652 |
|
| 653 |
+
struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
|
| 654 |
auto it = tensors_map.name_to_idx.find(name);
|
| 655 |
if (it == tensors_map.name_to_idx.end()) {
|
| 656 |
throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
|
|
|
|
| 661 |
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
|
| 662 |
}
|
| 663 |
|
| 664 |
+
return get_tensor_for(lt, backend);
|
| 665 |
}
|
| 666 |
|
| 667 |
+
struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
|
| 668 |
struct ggml_tensor * tensor;
|
| 669 |
if (lt.ne.size() == 2) {
|
| 670 |
tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
|
|
|
|
| 674 |
}
|
| 675 |
ggml_set_name(tensor, lt.name.c_str());
|
| 676 |
LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
|
| 677 |
+
tensor->backend = backend;
|
| 678 |
lt.ggml_tensor = tensor;
|
| 679 |
num_ggml_tensors_created++;
|
| 680 |
return tensor;
|
|
|
|
| 688 |
|
| 689 |
void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
|
| 690 |
size_t data_size = 0;
|
| 691 |
+
size_t prefetch_size = 0;
|
| 692 |
for (const llama_load_tensor & lt : tensors_map.tensors) {
|
| 693 |
data_size += lt.size;
|
| 694 |
+
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
|
| 695 |
+
prefetch_size += lt.size;
|
| 696 |
+
}
|
| 697 |
}
|
| 698 |
|
| 699 |
if (use_mmap) {
|
| 700 |
+
mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
|
| 701 |
if (!lmlock) {
|
| 702 |
// Don't call the callback since the actual loading will be lazy
|
| 703 |
// and we can't measure it.
|
|
|
|
| 710 |
|
| 711 |
size_t done_size = 0;
|
| 712 |
for (llama_load_tensor & lt : tensors_map.tensors) {
|
| 713 |
+
if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
|
| 714 |
+
continue;
|
| 715 |
+
}
|
| 716 |
if (progress_callback) {
|
| 717 |
progress_callback((float) done_size / data_size, progress_callback_user_data);
|
| 718 |
}
|
|
|
|
| 725 |
lmlock->grow_to(done_size);
|
| 726 |
}
|
| 727 |
}
|
|
|
|
|
|
|
|
|
|
| 728 |
}
|
| 729 |
|
| 730 |
void load_data_for(llama_load_tensor & lt) {
|
|
|
|
| 826 |
struct llama_context_params llama_context_default_params() {
|
| 827 |
struct llama_context_params result = {
|
| 828 |
/*.n_ctx =*/ 512,
|
|
|
|
| 829 |
/*.gpu_layers =*/ 0,
|
| 830 |
/*.seed =*/ -1,
|
| 831 |
+
/*.f16_kv =*/ true,
|
| 832 |
/*.logits_all =*/ false,
|
| 833 |
/*.vocab_only =*/ false,
|
| 834 |
/*.use_mmap =*/ true,
|
|
|
|
| 849 |
return llama_mlock::SUPPORTED;
|
| 850 |
}
|
| 851 |
|
| 852 |
+
void llama_init_backend() {
|
| 853 |
+
ggml_time_init();
|
| 854 |
+
|
| 855 |
+
// needed to initialize f16 tables
|
| 856 |
+
{
|
| 857 |
+
struct ggml_init_params params = { 0, NULL, false };
|
| 858 |
+
struct ggml_context * ctx = ggml_init(params);
|
| 859 |
+
ggml_free(ctx);
|
| 860 |
+
}
|
| 861 |
+
}
|
| 862 |
+
|
| 863 |
+
int64_t llama_time_us() {
|
| 864 |
+
return ggml_time_us();
|
| 865 |
+
}
|
| 866 |
+
|
| 867 |
//
|
| 868 |
// model loading
|
| 869 |
//
|
|
|
|
| 873 |
case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
|
| 874 |
case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
|
| 875 |
case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
|
| 876 |
+
case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
|
| 877 |
+
case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
|
| 878 |
}
|
| 879 |
|
| 880 |
return "unknown";
|
|
|
|
| 954 |
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
| 955 |
}
|
| 956 |
|
| 957 |
+
if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
|
| 958 |
if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
|
| 959 |
hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
|
| 960 |
hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
|
| 961 |
+
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
|
| 962 |
+
}
|
| 963 |
+
}
|
| 964 |
+
|
| 965 |
+
if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
|
| 966 |
+
if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
|
| 967 |
+
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
|
| 968 |
+
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
|
| 969 |
+
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
|
| 970 |
}
|
| 971 |
}
|
| 972 |
|
|
|
|
| 979 |
size_t ctx_size;
|
| 980 |
size_t mmapped_size;
|
| 981 |
ml->calc_sizes(&ctx_size, &mmapped_size);
|
| 982 |
+
fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 983 |
|
| 984 |
// create the ggml context
|
| 985 |
{
|
|
|
|
| 1001 |
}
|
| 1002 |
}
|
| 1003 |
|
| 1004 |
+
#ifdef GGML_USE_CUBLAS
|
| 1005 |
+
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
|
| 1006 |
+
#else
|
| 1007 |
+
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
|
| 1008 |
+
#endif
|
| 1009 |
+
|
| 1010 |
// prepare memory for the weights
|
| 1011 |
+
size_t vram_total = 0;
|
| 1012 |
{
|
| 1013 |
const uint32_t n_embd = hparams.n_embd;
|
| 1014 |
const uint32_t n_layer = hparams.n_layer;
|
|
|
|
| 1016 |
|
| 1017 |
ml->ggml_ctx = ctx;
|
| 1018 |
|
| 1019 |
+
model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
| 1020 |
+
model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
|
| 1021 |
+
|
| 1022 |
+
// "output" tensor
|
| 1023 |
+
{
|
| 1024 |
+
ggml_backend backend_output;
|
| 1025 |
+
if (n_gpu_layers > int(n_layer)) { // NOLINT
|
| 1026 |
+
backend_output = LLAMA_BACKEND_OFFLOAD;
|
| 1027 |
+
} else {
|
| 1028 |
+
backend_output = GGML_BACKEND_CPU;
|
| 1029 |
+
}
|
| 1030 |
+
|
| 1031 |
+
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
|
| 1032 |
+
}
|
| 1033 |
+
|
| 1034 |
+
const int i_gpu_start = n_layer - n_gpu_layers;
|
| 1035 |
|
| 1036 |
model.layers.resize(n_layer);
|
| 1037 |
for (uint32_t i = 0; i < n_layer; ++i) {
|
| 1038 |
+
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
| 1039 |
+
|
| 1040 |
auto & layer = model.layers[i];
|
| 1041 |
|
| 1042 |
std::string layers_i = "layers." + std::to_string(i);
|
| 1043 |
|
| 1044 |
+
layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
|
| 1045 |
+
|
| 1046 |
+
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
|
| 1047 |
+
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
|
| 1048 |
+
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
|
| 1049 |
+
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
|
| 1050 |
|
| 1051 |
+
layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
|
|
|
|
|
|
|
|
|
|
| 1052 |
|
| 1053 |
+
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
|
| 1054 |
+
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
|
| 1055 |
+
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
|
| 1056 |
|
| 1057 |
+
if (backend == GGML_BACKEND_CUDA) {
|
| 1058 |
+
vram_total +=
|
| 1059 |
+
ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
|
| 1060 |
+
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + // ffn_norm, not attention_norm a second time
|
| 1061 |
+
ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
|
| 1062 |
+
}
|
| 1063 |
}
|
| 1064 |
}
|
| 1065 |
|
| 1066 |
ml->done_getting_tensors();
|
| 1067 |
|
| 1068 |
+
// print memory requirements
|
| 1069 |
+
{
|
| 1070 |
+
const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
|
|
|
|
| 1071 |
|
| 1072 |
+
// this is the total memory required to run the inference
|
| 1073 |
+
const size_t mem_required =
|
| 1074 |
+
ctx_size +
|
| 1075 |
+
mmapped_size - vram_total + // weights in VRAM not in memory
|
| 1076 |
+
MEM_REQ_SCRATCH0().at(model.type) +
|
| 1077 |
+
MEM_REQ_SCRATCH1().at(model.type) +
|
| 1078 |
+
MEM_REQ_EVAL().at(model.type);
|
| 1079 |
+
|
| 1080 |
+
// this is the memory required by one llama_state
|
| 1081 |
+
const size_t mem_required_state =
|
| 1082 |
+
scale*MEM_REQ_KV_SELF().at(model.type);
|
| 1083 |
+
|
| 1084 |
+
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
| 1085 |
+
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
| 1086 |
|
|
|
|
| 1087 |
#ifdef GGML_USE_CUBLAS
|
|
|
|
| 1088 |
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
| 1089 |
|
| 1090 |
fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
|
| 1091 |
+
if (n_gpu_layers > (int) hparams.n_layer) {
|
| 1092 |
+
fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
|
| 1093 |
+
}
|
| 1094 |
+
fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
|
| 1095 |
+
#else
|
| 1096 |
+
(void) n_gpu_layers;
|
| 1097 |
+
#endif
|
| 1098 |
+
}
|
| 1099 |
|
| 1100 |
+
// populate `tensors_by_name`
|
| 1101 |
+
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
|
| 1102 |
+
model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
|
| 1103 |
+
}
|
| 1104 |
|
| 1105 |
+
ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
|
|
|
|
| 1106 |
|
| 1107 |
+
#ifdef GGML_USE_CUBLAS
|
| 1108 |
+
{
|
| 1109 |
+
size_t done_size = 0;
|
| 1110 |
+
size_t data_size = 0;
|
| 1111 |
+
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
|
| 1112 |
+
data_size += lt.size;
|
| 1113 |
+
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
|
| 1114 |
+
done_size += lt.size;
|
| 1115 |
+
}
|
| 1116 |
}
|
| 1117 |
+
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
|
| 1118 |
+
if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
|
| 1119 |
+
continue;
|
| 1120 |
+
}
|
| 1121 |
+
if (progress_callback) {
|
| 1122 |
+
progress_callback((float) done_size / data_size, progress_callback_user_data);
|
| 1123 |
+
}
|
| 1124 |
+
ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
|
| 1125 |
+
done_size += lt.size;
|
| 1126 |
}
|
| 1127 |
+
}
|
| 1128 |
+
#endif // GGML_USE_CUBLAS
|
| 1129 |
|
| 1130 |
+
if (progress_callback) {
|
| 1131 |
+
progress_callback(1.0f, progress_callback_user_data);
|
| 1132 |
}
|
| 1133 |
+
|
| 1134 |
+
model.mapping = std::move(ml->mapping);
|
|
|
|
| 1135 |
|
| 1136 |
// loading time will be recalculated after the first eval, so
|
| 1137 |
// we take page faults deferred by mmap() into consideration
|
|
|
|
| 1230 |
{
|
| 1231 |
cur = ggml_rms_norm(ctx0, inpL);
|
| 1232 |
|
| 1233 |
+
// cur = cur*attention_norm(broadcasted)
|
| 1234 |
+
cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
|
|
|
|
|
|
|
| 1235 |
}
|
| 1236 |
|
| 1237 |
// self-attention
|
|
|
|
| 1338 |
{
|
| 1339 |
cur = ggml_rms_norm(ctx0, inpFF);
|
| 1340 |
|
| 1341 |
+
// cur = cur*ffn_norm(broadcasted)
|
| 1342 |
+
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
|
|
|
|
|
|
|
| 1343 |
}
|
| 1344 |
|
| 1345 |
struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
|
|
|
|
| 1376 |
|
| 1377 |
inpL = ggml_rms_norm(ctx0, inpL);
|
| 1378 |
|
| 1379 |
+
// inpL = inpL*norm(broadcasted)
|
| 1380 |
+
inpL = ggml_mul(ctx0, inpL, model.norm);
|
|
|
|
|
|
|
| 1381 |
|
| 1382 |
embeddings = inpL;
|
| 1383 |
}
|
|
|
|
| 2201 |
unsigned * cur_percentage_p = (unsigned *) ctx;
|
| 2202 |
unsigned percentage = (unsigned) (100 * progress);
|
| 2203 |
while (percentage > *cur_percentage_p) {
|
| 2204 |
+
*cur_percentage_p = percentage;
|
| 2205 |
fprintf(stderr, ".");
|
| 2206 |
fflush(stderr);
|
| 2207 |
if (percentage >= 100) {
|
|
|
|
| 2294 |
{
|
| 2295 |
uint32_t magic;
|
| 2296 |
fin.read((char *) &magic, sizeof(magic));
|
| 2297 |
+
if (magic != LLAMA_FILE_MAGIC_GGLA) {
|
| 2298 |
fprintf(stderr, "%s: bad file magic\n", __func__);
|
| 2299 |
return 1;
|
| 2300 |
}
|
|
|
|
| 2358 |
|
| 2359 |
// maybe this should be in llama_model_loader
|
| 2360 |
if (model_loader->use_mmap) {
|
| 2361 |
+
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
|
| 2362 |
}
|
| 2363 |
}
|
| 2364 |
|
|
|
|
| 2451 |
}
|
| 2452 |
size_t idx = model_loader->tensors_map.name_to_idx[base_name];
|
| 2453 |
llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
|
| 2454 |
+
base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
|
| 2455 |
lt.data = (uint8_t *) lt.ggml_tensor->data;
|
| 2456 |
model_loader->load_data_for(lt);
|
| 2457 |
lt.ggml_tensor->data = lt.data;
|
|
|
|
| 2677 |
}
|
| 2678 |
|
| 2679 |
// Sets the state reading from the specified source address
|
| 2680 |
+
size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
| 2681 |
+
uint8_t * inp = src;
|
| 2682 |
|
| 2683 |
// set rng
|
| 2684 |
{
|
examples/talk-llama/llama.h
CHANGED
|
@@ -19,10 +19,16 @@
|
|
| 19 |
# define LLAMA_API
|
| 20 |
#endif
|
| 21 |
|
| 22 |
-
#define
|
| 23 |
-
#define
|
| 24 |
-
#define
|
| 25 |
-
#define
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
#define LLAMA_SESSION_VERSION 1
|
| 27 |
|
| 28 |
#ifdef __cplusplus
|
|
@@ -40,9 +46,9 @@ extern "C" {
|
|
| 40 |
typedef int llama_token;
|
| 41 |
|
| 42 |
typedef struct llama_token_data {
|
| 43 |
-
llama_token id;
|
| 44 |
-
float logit;
|
| 45 |
-
float p;
|
| 46 |
} llama_token_data;
|
| 47 |
|
| 48 |
typedef struct llama_token_data_array {
|
|
@@ -55,7 +61,6 @@ extern "C" {
|
|
| 55 |
|
| 56 |
struct llama_context_params {
|
| 57 |
int n_ctx; // text context
|
| 58 |
-
int n_parts; // -1 for default
|
| 59 |
int n_gpu_layers; // number of layers to store in VRAM
|
| 60 |
int seed; // RNG seed, -1 for random
|
| 61 |
|
|
@@ -74,16 +79,16 @@ extern "C" {
|
|
| 74 |
|
| 75 |
// model file types
|
| 76 |
enum llama_ftype {
|
| 77 |
-
LLAMA_FTYPE_ALL_F32
|
| 78 |
-
LLAMA_FTYPE_MOSTLY_F16
|
| 79 |
-
LLAMA_FTYPE_MOSTLY_Q4_0
|
| 80 |
-
LLAMA_FTYPE_MOSTLY_Q4_1
|
| 81 |
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
| 82 |
-
// LLAMA_FTYPE_MOSTLY_Q4_2
|
| 83 |
-
// LLAMA_FTYPE_MOSTLY_Q4_3
|
| 84 |
-
LLAMA_FTYPE_MOSTLY_Q8_0
|
| 85 |
-
LLAMA_FTYPE_MOSTLY_Q5_0
|
| 86 |
-
LLAMA_FTYPE_MOSTLY_Q5_1
|
| 87 |
};
|
| 88 |
|
| 89 |
LLAMA_API struct llama_context_params llama_context_default_params();
|
|
@@ -91,6 +96,13 @@ extern "C" {
|
|
| 91 |
LLAMA_API bool llama_mmap_supported();
|
| 92 |
LLAMA_API bool llama_mlock_supported();
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
// Various functions for loading a ggml llama model.
|
| 95 |
// Allocate (almost) all memory needed for the model.
|
| 96 |
// Return NULL on failure
|
|
@@ -139,7 +151,7 @@ extern "C" {
|
|
| 139 |
|
| 140 |
// Set the state reading from the specified address
|
| 141 |
// Returns the number of bytes read
|
| 142 |
-
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx,
|
| 143 |
|
| 144 |
// Save/load session file
|
| 145 |
LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
|
|
|
|
| 19 |
# define LLAMA_API
|
| 20 |
#endif
|
| 21 |
|
| 22 |
+
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
|
| 23 |
+
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
|
| 24 |
+
#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
|
| 25 |
+
#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
|
| 26 |
+
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
| 27 |
+
|
| 28 |
+
#define LLAMA_FILE_VERSION 3
|
| 29 |
+
#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
|
| 30 |
+
#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
|
| 31 |
+
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
| 32 |
#define LLAMA_SESSION_VERSION 1
|
| 33 |
|
| 34 |
#ifdef __cplusplus
|
|
|
|
| 46 |
typedef int llama_token;
|
| 47 |
|
| 48 |
typedef struct llama_token_data {
|
| 49 |
+
llama_token id; // token id
|
| 50 |
+
float logit; // log-odds of the token
|
| 51 |
+
float p; // probability of the token
|
| 52 |
} llama_token_data;
|
| 53 |
|
| 54 |
typedef struct llama_token_data_array {
|
|
|
|
| 61 |
|
| 62 |
struct llama_context_params {
|
| 63 |
int n_ctx; // text context
|
|
|
|
| 64 |
int n_gpu_layers; // number of layers to store in VRAM
|
| 65 |
int seed; // RNG seed, -1 for random
|
| 66 |
|
|
|
|
| 79 |
|
| 80 |
// model file types
|
| 81 |
enum llama_ftype {
|
| 82 |
+
LLAMA_FTYPE_ALL_F32 = 0,
|
| 83 |
+
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
| 84 |
+
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
| 85 |
+
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
| 86 |
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
| 87 |
+
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
|
| 88 |
+
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
|
| 89 |
+
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
| 90 |
+
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
| 91 |
+
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
| 92 |
};
|
| 93 |
|
| 94 |
LLAMA_API struct llama_context_params llama_context_default_params();
|
|
|
|
| 96 |
LLAMA_API bool llama_mmap_supported();
|
| 97 |
LLAMA_API bool llama_mlock_supported();
|
| 98 |
|
| 99 |
+
// TODO: not great API - very likely to change
|
| 100 |
+
// Initialize the llama + ggml backend
|
| 101 |
+
// Call once at the start of the program
|
| 102 |
+
LLAMA_API void llama_init_backend();
|
| 103 |
+
|
| 104 |
+
LLAMA_API int64_t llama_time_us();
|
| 105 |
+
|
| 106 |
// Various functions for loading a ggml llama model.
|
| 107 |
// Allocate (almost) all memory needed for the model.
|
| 108 |
// Return NULL on failure
|
|
|
|
| 151 |
|
| 152 |
// Set the state reading from the specified address
|
| 153 |
// Returns the number of bytes read
|
| 154 |
+
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
|
| 155 |
|
| 156 |
// Save/load session file
|
| 157 |
LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
|
examples/talk-llama/talk-llama.cpp
CHANGED
|
@@ -33,8 +33,6 @@ struct whisper_params {
|
|
| 33 |
int32_t max_tokens = 32;
|
| 34 |
int32_t audio_ctx = 0;
|
| 35 |
|
| 36 |
-
int32_t n_parts_llama = -1;
|
| 37 |
-
|
| 38 |
float vad_thold = 0.6f;
|
| 39 |
float freq_thold = 100.0f;
|
| 40 |
|
|
@@ -72,7 +70,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 72 |
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
| 73 |
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
|
| 74 |
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
|
| 75 |
-
else if (arg == "--n-parts-llama") { params.n_parts_llama = std::stoi(argv[++i]); }
|
| 76 |
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
| 77 |
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 78 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
|
@@ -123,7 +120,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 123 |
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
|
| 124 |
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
|
| 125 |
fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
|
| 126 |
-
fprintf(stderr, " --n-parts-llama N [%-7d] num parts in llama model file\n", params.n_parts_llama);
|
| 127 |
fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
|
| 128 |
fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", "");
|
| 129 |
fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n");
|
|
@@ -239,13 +235,14 @@ int main(int argc, char ** argv) {
|
|
| 239 |
|
| 240 |
// llama init
|
| 241 |
|
|
|
|
|
|
|
| 242 |
auto lparams = llama_context_default_params();
|
| 243 |
|
| 244 |
// tune these to your liking
|
| 245 |
lparams.n_ctx = 2048;
|
| 246 |
lparams.seed = 1;
|
| 247 |
lparams.f16_kv = true;
|
| 248 |
-
lparams.n_parts = params.n_parts_llama;
|
| 249 |
|
| 250 |
struct llama_context * ctx_llama = llama_init_from_file(params.model_llama.c_str(), lparams);
|
| 251 |
|
|
|
|
| 33 |
int32_t max_tokens = 32;
|
| 34 |
int32_t audio_ctx = 0;
|
| 35 |
|
|
|
|
|
|
|
| 36 |
float vad_thold = 0.6f;
|
| 37 |
float freq_thold = 100.0f;
|
| 38 |
|
|
|
|
| 70 |
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
| 71 |
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
|
| 72 |
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
|
|
|
|
| 73 |
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
| 74 |
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 75 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
|
|
|
| 120 |
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
|
| 121 |
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
|
| 122 |
fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
|
|
|
|
| 123 |
fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
|
| 124 |
fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", "");
|
| 125 |
fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n");
|
|
|
|
| 235 |
|
| 236 |
// llama init
|
| 237 |
|
| 238 |
+
llama_init_backend();
|
| 239 |
+
|
| 240 |
auto lparams = llama_context_default_params();
|
| 241 |
|
| 242 |
// tune these to your liking
|
| 243 |
lparams.n_ctx = 2048;
|
| 244 |
lparams.seed = 1;
|
| 245 |
lparams.f16_kv = true;
|
|
|
|
| 246 |
|
| 247 |
struct llama_context * ctx_llama = llama_init_from_file(params.model_llama.c_str(), lparams);
|
| 248 |
|