ggml : sync latest ggml
- New Q4 and Q5 formats
- Various improvements
- examples/common-ggml.cpp +0 -6
- examples/common.cpp +34 -0
- examples/common.h +3 -0
- examples/quantize/quantize.cpp +10 -4
- ggml-cuda.cu +291 -109
- ggml-cuda.h +2 -0
- ggml-opencl.c +85 -122
- ggml.c +0 -0
- ggml.h +205 -12
- whisper.cpp +31 -38
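
For readers unfamiliar with the new formats: the Q5 blocks added by this sync store a per-block scale plus 4-bit quants and keep the 5th bit of each value in a separate 32-bit mask, exactly as laid out by the block_q5_0/block_q5_1 structs and dequantize kernels in the ggml-cuda.cu diff below. The following CPU-side sketch decodes one Q5_0 block; it is illustrative only — the helper name is made up, and it takes the delta as an already-converted float, whereas the real block stores it as fp16.

```cpp
#include <cstdint>
#include <cstring>

// Illustrative sketch: decode one Q5_0 block (32 values) the same way the
// dequantize_block_q5_0 CUDA/OpenCL kernels in this diff do.
// 'd' is the per-block delta (stored as fp16 in the real format),
// 'qh' packs the 5th bit of each quant, 'qs' holds two 4-bit quants per byte.
void dequantize_q5_0_block(float d, const uint8_t qh[4], const uint8_t qs[16], float out[32]) {
    uint32_t h;
    std::memcpy(&h, qh, sizeof(h));

    for (int j = 0; j < 16; ++j) {
        const uint8_t xh_0 = ((h >> (j +  0)) << 4) & 0x10;  // 5th bit of the low nibble
        const uint8_t xh_1 = ((h >> (j + 12))     ) & 0x10;  // 5th bit of the high nibble

        const int x0 = ((qs[j] & 0x0F) | xh_0) - 16;          // signed 5-bit value
        const int x1 = ((qs[j] >>   4) | xh_1) - 16;

        out[j]      = x0 * d;
        out[j + 16] = x1 * d;
    }
}
```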
examples/common-ggml.cpp
CHANGED
@@ -6,7 +6,6 @@
 static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
     {"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
     {"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
-    {"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
     {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
     {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
     {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
@@ -46,7 +45,6 @@ bool ggml_common_quantize_0(
     switch (ftype) {
         case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
         case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
-        case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
         case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
         case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
         case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
@@ -171,10 +169,6 @@ bool ggml_common_quantize_0(
                 {
                     cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                 } break;
-            case GGML_TYPE_Q4_2:
-                {
-                    cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-                } break;
             case GGML_TYPE_Q5_0:
                 {
                     cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
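
As a usage sketch (hypothetical helper name, mirroring the GGML_FTYPE_MAP table above): a quantization tool would translate the user-facing name into a ggml_ftype and then let ggml_common_quantize_0 pick the matching ggml_type — note that "q4_2" is intentionally no longer accepted.

```cpp
#include <map>
#include <string>
#include "ggml.h"

// Illustrative sketch: map a quantization name from the command line to a
// ggml file type, using the same entries as GGML_FTYPE_MAP above.
static enum ggml_ftype parse_ftype(const std::string & name) {
    static const std::map<std::string, enum ggml_ftype> ftypes = {
        {"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
        {"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
        {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
        {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
        {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
    };
    const auto it = ftypes.find(name);
    return it == ftypes.end() ? GGML_FTYPE_UNKNOWN : it->second;
}
```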
examples/common.cpp
CHANGED
@@ -38,6 +38,20 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "-h" || arg == "--help") {
             gpt_print_usage(argc, argv, params);
             exit(0);
+        } else if (arg == "-f" || arg == "--file") {
+            if (++i > argc) {
+                fprintf(stderr, "Invalid file param");
+                break;
+            }
+            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                break;
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
+            if (params.prompt.back() == '\n') {
+                params.prompt.pop_back();
+            }
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             gpt_print_usage(argc, argv, params);
@@ -57,6 +71,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
     fprintf(stderr, "                        prompt to start generation with (default: random)\n");
+    fprintf(stderr, "  -f FNAME, --file FNAME\n");
+    fprintf(stderr, "                        load prompt from a file\n");
     fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
     fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
     fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
@@ -192,6 +208,10 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
     return result;
 }
 
+void gpt_vocab::add_special_token(const std::string & token) {
+    special_tokens.push_back(token);
+}
+
 std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
     std::vector<std::string> words;
 
@@ -200,6 +220,20 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
     std::string str = text;
     std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
 
+    // Generate the subpattern from the special_tokens vector if it's not empty
+    if (!vocab.special_tokens.empty()) {
+        std::string special_tokens_subpattern;
+        for (const auto & token : vocab.special_tokens) {
+            if (!special_tokens_subpattern.empty()) {
+                special_tokens_subpattern += "|";
+            }
+            special_tokens_subpattern += token;
+        }
+
+        // Modify the regex pattern with the generated special tokens subpattern
+        pat = special_tokens_subpattern + "|" + pat;
+    }
+
     std::regex re(pat);
     std::smatch m;
 
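
Two things change for callers here: the prompt can now come from a file via the new -f FNAME / --file FNAME flag (a trailing newline is stripped), and a vocabulary can register special tokens that gpt_tokenize() will match ahead of the generic regex pattern. A minimal, hypothetical caller sketch (the token strings are just examples):

```cpp
#include <string>
#include <vector>
#include "common.h"

// Illustrative sketch: register special tokens so that gpt_tokenize() keeps
// them intact instead of splitting them with the default pattern.
std::vector<gpt_vocab::id> tokenize_with_specials(gpt_vocab & vocab, const std::string & text) {
    vocab.add_special_token("### Instruction:");  // example strings; real models
    vocab.add_special_token("### Response:");     // define their own markers
    return gpt_tokenize(vocab, text);
}
```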
examples/common.h
CHANGED
@@ -53,6 +53,9 @@ struct gpt_vocab {
 
     std::map<token, id> token_to_id;
     std::map<id, token> id_to_token;
+    std::vector<std::string> special_tokens;
+
+    void add_special_token(const std::string & token);
 };
 
 // poor-man's JSON parsing
examples/quantize/quantize.cpp
CHANGED
@@ -25,7 +25,7 @@ struct whisper_hparams {
     int32_t n_text_head  = 6;
     int32_t n_text_layer = 4;
     int32_t n_mels       = 80;
-    int32_t
+    int32_t ftype        = 1;
 };
 
 struct whisper_filters {
@@ -79,7 +79,10 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
     finp.read((char *) &hparams.n_text_head,  sizeof(hparams.n_text_head));
     finp.read((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
     finp.read((char *) &hparams.n_mels,       sizeof(hparams.n_mels));
-    finp.read((char *) &hparams.
+    finp.read((char *) &hparams.ftype,        sizeof(hparams.ftype));
+
+    const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+    const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
 
     fprintf(stderr, "%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
     fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
@@ -91,7 +94,10 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
     fprintf(stderr, "%s: n_text_head  = %d\n", __func__, hparams.n_text_head);
     fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
     fprintf(stderr, "%s: n_mels       = %d\n", __func__, hparams.n_mels);
-    fprintf(stderr, "%s:
+    fprintf(stderr, "%s: ftype (src)  = %d\n", __func__, hparams.ftype);
+    fprintf(stderr, "%s: qntvr (src)  = %d\n", __func__, qntvr_src);
+    fprintf(stderr, "%s: ftype (dst)  = %d\n", __func__, ftype_dst);
+    fprintf(stderr, "%s: qntvr (dst)  = %d\n", __func__, GGML_QNT_VERSION);
 
     fout.write((char *) &hparams.n_vocab,     sizeof(hparams.n_vocab));
     fout.write((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
@@ -103,7 +109,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
     fout.write((char *) &hparams.n_text_head,  sizeof(hparams.n_text_head));
     fout.write((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
     fout.write((char *) &hparams.n_mels,       sizeof(hparams.n_mels));
-    fout.write((char *) &
+    fout.write((char *) &ftype_dst, sizeof(hparams.ftype));
 }
 
 // load mel filters
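
The ftype field written to quantized model files now also encodes a quantization version (qntvr): ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype, so loaders can split the two apart again. A small sketch of the round trip (the constants come from ggml.h; the function name here is illustrative):

```cpp
#include <cstdint>
#include <cstdio>
#include "ggml.h"

// Illustrative sketch: pack and unpack the quantization version and file type
// the same way quantize.cpp does above.
static int32_t pack_ftype(int32_t ftype) {
    return GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
}

int main() {
    const int32_t stored = pack_ftype(GGML_FTYPE_MOSTLY_Q5_0);
    const int32_t qntvr  = stored / GGML_QNT_VERSION_FACTOR;  // quantization version
    const int32_t ftype  = stored % GGML_QNT_VERSION_FACTOR;  // original file type
    printf("qntvr = %d, ftype = %d\n", qntvr, ftype);
    return 0;
}
```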
ggml-cuda.cu
CHANGED
|
@@ -32,9 +32,15 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
|
| 32 |
} \
|
| 33 |
} while (0)
|
| 34 |
|
|
|
|
| 35 |
typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
#define QK4_0 32
|
|
|
|
| 38 |
typedef struct {
|
| 39 |
float d; // delta
|
| 40 |
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
|
@@ -42,6 +48,7 @@ typedef struct {
|
|
| 42 |
static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
| 43 |
|
| 44 |
#define QK4_1 32
|
|
|
|
| 45 |
typedef struct {
|
| 46 |
float d; // delta
|
| 47 |
float m; // min
|
|
@@ -49,14 +56,8 @@ typedef struct {
|
|
| 49 |
} block_q4_1;
|
| 50 |
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
|
| 51 |
|
| 52 |
-#define QK4_2 16
-typedef struct {
-    half d;                 // delta
-    uint8_t qs[QK4_2 / 2];  // nibbles / quants
-} block_q4_2;
-static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
-
| 59 |
#define QK5_0 32
|
|
|
|
| 60 |
typedef struct {
|
| 61 |
half d; // delta
|
| 62 |
uint8_t qh[4]; // 5-th bit of quants
|
|
@@ -65,6 +66,7 @@ typedef struct {
|
|
| 65 |
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
|
| 66 |
|
| 67 |
#define QK5_1 32
|
|
|
|
| 68 |
typedef struct {
|
| 69 |
half d; // delta
|
| 70 |
half m; // min
|
|
@@ -74,112 +76,164 @@ typedef struct {
|
|
| 74 |
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
|
| 75 |
|
| 76 |
#define QK8_0 32
|
|
|
|
| 77 |
typedef struct {
|
| 78 |
float d; // delta
|
| 79 |
int8_t qs[QK8_0]; // quants
|
| 80 |
} block_q8_0;
|
| 81 |
static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
|
| 82 |
|
| 83 |
-
|
|
|
|
|
|
|
| 84 |
const block_q4_0 * x = (const block_q4_0 *) vx;
|
| 85 |
|
| 86 |
-
const
|
| 87 |
|
| 88 |
-
const
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
-
|
| 93 |
-
|
| 94 |
|
| 95 |
-
|
| 96 |
-
const int8_t vi1 = vi >> 4;
|
| 97 |
|
| 98 |
-
|
| 99 |
-
|
| 100 |
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
}
|
| 104 |
}
|
| 105 |
|
| 106 |
-
static
|
| 107 |
-
const
|
| 108 |
|
| 109 |
-
const
|
| 110 |
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
-
const
|
|
|
|
| 115 |
|
| 116 |
-
|
| 117 |
-
|
|
|
|
| 118 |
|
| 119 |
-
|
| 120 |
-
|
| 121 |
|
| 122 |
-
|
| 123 |
-
|
| 124 |
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
}
|
| 129 |
|
| 130 |
-
static
|
| 131 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
const int i = blockIdx.x;
|
| 134 |
|
| 135 |
const float d = x[i].d;
|
| 136 |
|
| 137 |
-
|
|
|
|
|
|
|
| 138 |
|
| 139 |
-
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
-
|
| 143 |
-
const int8_t vi1 = vi >> 4;
|
| 144 |
|
| 145 |
-
|
| 146 |
-
const float v1 = (vi1 - 8)*d;
|
| 147 |
|
| 148 |
-
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
}
|
| 151 |
}
|
| 152 |
|
| 153 |
static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
|
|
|
|
|
|
|
| 154 |
const block_q5_0 * x = (const block_q5_0 *) vx;
|
| 155 |
|
| 156 |
const int i = blockIdx.x;
|
| 157 |
|
| 158 |
const float d = x[i].d;
|
| 159 |
|
| 160 |
-
const uint8_t * pp = x[i].qs;
|
| 161 |
-
|
| 162 |
uint32_t qh;
|
| 163 |
memcpy(&qh, x[i].qh, sizeof(qh));
|
| 164 |
|
| 165 |
-
for (int
|
| 166 |
-
const uint8_t
|
|
|
|
| 167 |
|
| 168 |
-
const
|
| 169 |
-
const
|
| 170 |
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
const float v0 = (vi0 - 16)*d;
|
| 175 |
-
const float v1 = (vi1 - 16)*d;
|
| 176 |
-
|
| 177 |
-
y[i*QK5_0 + l + 0] = v0;
|
| 178 |
-
y[i*QK5_0 + l + 1] = v1;
|
| 179 |
}
|
| 180 |
}
|
| 181 |
|
| 182 |
static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
|
|
|
|
|
|
|
| 183 |
const block_q5_1 * x = (const block_q5_1 *) vx;
|
| 184 |
|
| 185 |
const int i = blockIdx.x;
|
|
@@ -187,41 +241,70 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
|
|
| 187 |
const float d = x[i].d;
|
| 188 |
const float m = x[i].m;
|
| 189 |
|
| 190 |
-
const uint8_t * pp = x[i].qs;
|
| 191 |
-
|
| 192 |
uint32_t qh;
|
| 193 |
memcpy(&qh, x[i].qh, sizeof(qh));
|
| 194 |
|
| 195 |
-
for (int
|
| 196 |
-
const uint8_t
|
|
|
|
| 197 |
|
| 198 |
-
const
|
| 199 |
-
const
|
| 200 |
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
const float v0 = vi0*d + m;
|
| 205 |
-
const float v1 = vi1*d + m;
|
| 206 |
-
|
| 207 |
-
y[i*QK5_1 + l + 0] = v0;
|
| 208 |
-
y[i*QK5_1 + l + 1] = v1;
|
| 209 |
}
|
| 210 |
}
|
| 211 |
|
| 212 |
static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
|
|
|
|
|
|
|
| 213 |
const block_q8_0 * x = (const block_q8_0 *) vx;
|
| 214 |
|
| 215 |
const int i = blockIdx.x;
|
| 216 |
|
| 217 |
const float d = x[i].d;
|
| 218 |
|
| 219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
|
| 221 |
-
|
| 222 |
-
|
| 223 |
|
| 224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
}
|
| 226 |
}
|
| 227 |
|
|
@@ -235,11 +318,6 @@ static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStre
|
|
| 235 |
dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
|
| 236 |
}
|
| 237 |
|
| 238 |
-static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
-    const int nb = k / QK4_2;
-    dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
-}
-
| 243 |
static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
|
| 244 |
const int nb = k / QK5_0;
|
| 245 |
dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
|
|
@@ -255,6 +333,36 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStre
|
|
| 255 |
dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
|
| 256 |
}
|
| 257 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
// TODO: optimize
|
| 259 |
static __global__ void convert_fp16_to_fp32(const void * vx, float * y) {
|
| 260 |
const half * x = (const half *) vx;
|
|
@@ -268,14 +376,18 @@ static void convert_fp16_to_fp32_cuda(const void * x, float * y, int k, cudaStre
|
|
| 268 |
convert_fp16_to_fp32<<<k, 1, 0, stream>>>(x, y);
|
| 269 |
}
|
| 270 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
| 272 |
switch (type) {
|
| 273 |
case GGML_TYPE_Q4_0:
|
| 274 |
return dequantize_row_q4_0_cuda;
|
| 275 |
case GGML_TYPE_Q4_1:
|
| 276 |
return dequantize_row_q4_1_cuda;
|
| 277 |
-        case GGML_TYPE_Q4_2:
-            return dequantize_row_q4_2_cuda;
| 279 |
case GGML_TYPE_Q5_0:
|
| 280 |
return dequantize_row_q5_0_cuda;
|
| 281 |
case GGML_TYPE_Q5_1:
|
|
@@ -289,8 +401,27 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
|
| 289 |
}
|
| 290 |
}
|
| 291 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
// buffer pool for cuda
|
| 293 |
-
#define MAX_CUDA_BUFFERS
|
| 294 |
|
| 295 |
struct scoped_spin_lock {
|
| 296 |
std::atomic_flag& lock;
|
|
@@ -348,7 +479,7 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
|
|
| 348 |
CUDA_CHECK(cudaFree(ptr));
|
| 349 |
}
|
| 350 |
|
| 351 |
-
#define GGML_CUDA_MAX_STREAMS 8
|
| 352 |
#define GGML_CUDA_MAX_EVENTS 64
|
| 353 |
static cublasHandle_t g_cublasH = nullptr;
|
| 354 |
static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_STREAMS] = { nullptr };
|
|
@@ -587,6 +718,7 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
|
|
| 587 |
const int nb2 = dst->nb[2];
|
| 588 |
const int nb3 = dst->nb[3];
|
| 589 |
const ggml_type type = src0->type;
|
|
|
|
| 590 |
|
| 591 |
const float alpha = 1.0f;
|
| 592 |
const float beta = 0.0f;
|
|
@@ -597,12 +729,16 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
|
|
| 597 |
const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
|
| 598 |
|
| 599 |
size_t x_size, y_size, d_size, q_size;
|
| 600 |
-
float * d_X =
|
|
|
|
|
|
|
|
|
|
| 601 |
float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
|
| 602 |
float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
|
| 603 |
char * d_Q = (char *) ggml_cuda_pool_malloc(n_mm * q_sz, &q_size);
|
| 604 |
|
| 605 |
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(type);
|
|
|
|
| 606 |
GGML_ASSERT(to_fp32_cuda != nullptr);
|
| 607 |
|
| 608 |
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
|
@@ -612,31 +748,54 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
|
|
| 612 |
cudaStream_t cudaStream2 = g_cudaStreams2[i % GGML_CUDA_MAX_STREAMS];
|
| 613 |
cudaEvent_t cudaEvent = g_cudaEvents[i % GGML_CUDA_MAX_EVENTS];
|
| 614 |
|
| 615 |
-
float * c_X = d_X + i * x_ne;
|
| 616 |
float * c_Y = d_Y + i * y_ne;
|
| 617 |
float * c_D = d_D + i * d_ne;
|
| 618 |
char * c_Q = d_Q + i * q_sz;
|
| 619 |
|
| 620 |
-
// copy src0
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 625 |
|
| 626 |
-
|
| 627 |
-
|
| 628 |
|
| 629 |
-
|
| 630 |
-
|
| 631 |
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 640 |
|
| 641 |
// copy dst to host
|
| 642 |
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
|
@@ -645,7 +804,9 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
|
|
| 645 |
}
|
| 646 |
|
| 647 |
CUDA_CHECK(cudaDeviceSynchronize());
|
| 648 |
-
|
|
|
|
|
|
|
| 649 |
ggml_cuda_pool_free(d_Y, y_size);
|
| 650 |
ggml_cuda_pool_free(d_D, d_size);
|
| 651 |
ggml_cuda_pool_free(d_Q, q_size);
|
|
@@ -661,8 +822,7 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
|
|
| 661 |
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
| 662 |
src1->type == GGML_TYPE_F32 &&
|
| 663 |
dst->type == GGML_TYPE_F32 &&
|
| 664 |
-
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
|
| 665 |
-
|
| 666 |
return true;
|
| 667 |
}
|
| 668 |
|
|
@@ -714,3 +874,25 @@ size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct
|
|
| 714 |
return 0;
|
| 715 |
}
|
| 716 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
} \
|
| 33 |
} while (0)
|
| 34 |
|
| 35 |
+
typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
|
| 36 |
typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
|
| 37 |
+
typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream);
|
| 38 |
+
|
| 39 |
+
// QK = number of values after dequantization
|
| 40 |
+
// QR = QK / number of values before dequantization
|
| 41 |
|
| 42 |
#define QK4_0 32
|
| 43 |
+
#define QR4_0 2
|
| 44 |
typedef struct {
|
| 45 |
float d; // delta
|
| 46 |
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
|
|
|
| 48 |
static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
| 49 |
|
| 50 |
#define QK4_1 32
|
| 51 |
+
#define QR4_1 2
|
| 52 |
typedef struct {
|
| 53 |
float d; // delta
|
| 54 |
float m; // min
|
|
|
|
| 56 |
} block_q4_1;
|
| 57 |
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
|
| 58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
#define QK5_0 32
|
| 60 |
+
#define QR5_0 2
|
| 61 |
typedef struct {
|
| 62 |
half d; // delta
|
| 63 |
uint8_t qh[4]; // 5-th bit of quants
|
|
|
|
| 66 |
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
|
| 67 |
|
| 68 |
#define QK5_1 32
|
| 69 |
+
#define QR5_1 2
|
| 70 |
typedef struct {
|
| 71 |
half d; // delta
|
| 72 |
half m; // min
|
|
|
|
| 76 |
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
|
| 77 |
|
| 78 |
#define QK8_0 32
|
| 79 |
+
#define QR8_0 1
|
| 80 |
typedef struct {
|
| 81 |
float d; // delta
|
| 82 |
int8_t qs[QK8_0]; // quants
|
| 83 |
} block_q8_0;
|
| 84 |
static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
|
| 85 |
|
| 86 |
+
#define CUDA_DMMV_BLOCK_SIZE 32
|
| 87 |
+
|
| 88 |
+
static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
|
| 89 |
const block_q4_0 * x = (const block_q4_0 *) vx;
|
| 90 |
|
| 91 |
+
const float d = x[ib].d;
|
| 92 |
|
| 93 |
+
const uint8_t vui = x[ib].qs[iqs];
|
| 94 |
+
|
| 95 |
+
const int8_t vi0 = vui & 0xF;
|
| 96 |
+
const int8_t vi1 = vui >> 4;
|
| 97 |
|
| 98 |
+
v0 = (vi0 - 8)*d;
|
| 99 |
+
v1 = (vi1 - 8)*d;
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
|
| 103 |
+
const block_q4_1 * x = (const block_q4_1 *) vx;
|
| 104 |
|
| 105 |
+
const float d = x[ib].d;
|
| 106 |
+
const float m = x[ib].m;
|
| 107 |
|
| 108 |
+
const uint8_t vui = x[ib].qs[iqs];
|
|
|
|
| 109 |
|
| 110 |
+
const int8_t vi0 = vui & 0xF;
|
| 111 |
+
const int8_t vi1 = vui >> 4;
|
| 112 |
|
| 113 |
+
v0 = vi0*d + m;
|
| 114 |
+
v1 = vi1*d + m;
|
|
|
|
| 115 |
}
|
| 116 |
|
| 117 |
+
static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
|
| 118 |
+
const block_q5_0 * x = (const block_q5_0 *) vx;
|
| 119 |
|
| 120 |
+
const float d = x[ib].d;
|
| 121 |
|
| 122 |
+
uint32_t qh;
|
| 123 |
+
memcpy(&qh, x[ib].qh, sizeof(qh));
|
| 124 |
+
|
| 125 |
+
const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
|
| 126 |
+
const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
|
| 127 |
|
| 128 |
+
const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16;
|
| 129 |
+
const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16;
|
| 130 |
|
| 131 |
+
v0 = x0*d;
|
| 132 |
+
v1 = x1*d;
|
| 133 |
+
}
|
| 134 |
|
| 135 |
+
static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
|
| 136 |
+
const block_q5_1 * x = (const block_q5_1 *) vx;
|
| 137 |
|
| 138 |
+
const float d = x[ib].d;
|
| 139 |
+
const float m = x[ib].m;
|
| 140 |
|
| 141 |
+
uint32_t qh;
|
| 142 |
+
memcpy(&qh, x[ib].qh, sizeof(qh));
|
| 143 |
+
|
| 144 |
+
const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
|
| 145 |
+
const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
|
| 146 |
+
|
| 147 |
+
const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0);
|
| 148 |
+
const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1);
|
| 149 |
+
|
| 150 |
+
v0 = x0*d + m;
|
| 151 |
+
v1 = x1*d + m;
|
| 152 |
}
|
| 153 |
|
| 154 |
+
static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
|
| 155 |
+
const block_q8_0 * x = (const block_q8_0 *) vx;
|
| 156 |
+
|
| 157 |
+
const float d = x[ib].d;
|
| 158 |
+
|
| 159 |
+
const int8_t vi0 = x[ib].qs[iqs + 0];
|
| 160 |
+
const int8_t vi1 = x[ib].qs[iqs + 1];
|
| 161 |
+
|
| 162 |
+
v0 = vi0*d;
|
| 163 |
+
v1 = vi1*d;
|
| 164 |
+
}
|
| 165 |
+
|
| 166 |
+
static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){
|
| 167 |
+
const half * x = (const half *) vx;
|
| 168 |
+
|
| 169 |
+
v0 = __half2float(x[ib + 0]);
|
| 170 |
+
v1 = __half2float(x[ib + 1]);
|
| 171 |
+
}
|
| 172 |
+
|
| 173 |
+
static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
|
| 174 |
+
static const int qk = QK4_0;
|
| 175 |
+
|
| 176 |
+
const block_q4_0 * x = (const block_q4_0 *) vx;
|
| 177 |
|
| 178 |
const int i = blockIdx.x;
|
| 179 |
|
| 180 |
const float d = x[i].d;
|
| 181 |
|
| 182 |
+
for (int j = 0; j < qk/2; ++j) {
|
| 183 |
+
const int x0 = (x[i].qs[j] & 0xf) - 8;
|
| 184 |
+
const int x1 = (x[i].qs[j] >> 4) - 8;
|
| 185 |
|
| 186 |
+
y[i*qk + j + 0 ] = x0*d;
|
| 187 |
+
y[i*qk + j + qk/2] = x1*d;
|
| 188 |
+
}
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
|
| 192 |
+
static const int qk = QK4_1;
|
| 193 |
|
| 194 |
+
const block_q4_1 * x = (const block_q4_1 *) vx;
|
|
|
|
| 195 |
|
| 196 |
+
const int i = blockIdx.x;
|
|
|
|
| 197 |
|
| 198 |
+
const float d = x[i].d;
|
| 199 |
+
const float m = x[i].m;
|
| 200 |
+
|
| 201 |
+
for (int j = 0; j < qk/2; ++j) {
|
| 202 |
+
const int x0 = (x[i].qs[j] & 0xf);
|
| 203 |
+
const int x1 = (x[i].qs[j] >> 4);
|
| 204 |
+
|
| 205 |
+
y[i*qk + j + 0 ] = x0*d + m;
|
| 206 |
+
y[i*qk + j + qk/2] = x1*d + m;
|
| 207 |
}
|
| 208 |
}
|
| 209 |
|
| 210 |
static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
|
| 211 |
+
static const int qk = QK5_0;
|
| 212 |
+
|
| 213 |
const block_q5_0 * x = (const block_q5_0 *) vx;
|
| 214 |
|
| 215 |
const int i = blockIdx.x;
|
| 216 |
|
| 217 |
const float d = x[i].d;
|
| 218 |
|
|
|
|
|
|
|
| 219 |
uint32_t qh;
|
| 220 |
memcpy(&qh, x[i].qh, sizeof(qh));
|
| 221 |
|
| 222 |
+
for (int j = 0; j < qk/2; ++j) {
|
| 223 |
+
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
| 224 |
+
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
| 225 |
|
| 226 |
+
const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
|
| 227 |
+
const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
|
| 228 |
|
| 229 |
+
y[i*qk + j + 0 ] = x0*d;
|
| 230 |
+
y[i*qk + j + qk/2] = x1*d;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
}
|
| 232 |
}
|
| 233 |
|
| 234 |
static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
|
| 235 |
+
static const int qk = QK5_1;
|
| 236 |
+
|
| 237 |
const block_q5_1 * x = (const block_q5_1 *) vx;
|
| 238 |
|
| 239 |
const int i = blockIdx.x;
|
|
|
|
| 241 |
const float d = x[i].d;
|
| 242 |
const float m = x[i].m;
|
| 243 |
|
|
|
|
|
|
|
| 244 |
uint32_t qh;
|
| 245 |
memcpy(&qh, x[i].qh, sizeof(qh));
|
| 246 |
|
| 247 |
+
for (int j = 0; j < qk/2; ++j) {
|
| 248 |
+
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
| 249 |
+
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
| 250 |
|
| 251 |
+
const int x0 = (x[i].qs[j] & 0xf) | xh_0;
|
| 252 |
+
const int x1 = (x[i].qs[j] >> 4) | xh_1;
|
| 253 |
|
| 254 |
+
y[i*qk + j + 0 ] = x0*d + m;
|
| 255 |
+
y[i*qk + j + qk/2] = x1*d + m;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
}
|
| 257 |
}
|
| 258 |
|
| 259 |
static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
|
| 260 |
+
static const int qk = QK8_0;
|
| 261 |
+
|
| 262 |
const block_q8_0 * x = (const block_q8_0 *) vx;
|
| 263 |
|
| 264 |
const int i = blockIdx.x;
|
| 265 |
|
| 266 |
const float d = x[i].d;
|
| 267 |
|
| 268 |
+
for (int j = 0; j < qk; ++j) {
|
| 269 |
+
y[i*qk + j] = x[i].qs[j]*d;
|
| 270 |
+
}
|
| 271 |
+
}
|
| 272 |
+
|
| 273 |
+
template <int block_size, int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
| 274 |
+
static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols) {
|
| 275 |
+
const int row = blockIdx.x;
|
| 276 |
+
const int tid = threadIdx.x;
|
| 277 |
+
|
| 278 |
+
const int y_offset = qr == 1 ? 1 : qk/2;
|
| 279 |
|
| 280 |
+
__shared__ float tmp[block_size]; // separate sum for each thread
|
| 281 |
+
tmp[tid] = 0;
|
| 282 |
|
| 283 |
+
for (int i = 0; i < ncols/block_size; i += 2) {
|
| 284 |
+
const int col = i*block_size + 2*tid;
|
| 285 |
+
const int ib = (row*ncols + col)/qk; // block index
|
| 286 |
+
const int iqs = (col%qk)/qr; // quant index
|
| 287 |
+
const int iybs = col - col%qk; // y block start index
|
| 288 |
+
|
| 289 |
+
// dequantize
|
| 290 |
+
float v0, v1;
|
| 291 |
+
dequantize_kernel(vx, ib, iqs, v0, v1);
|
| 292 |
+
|
| 293 |
+
// matrix multiplication
|
| 294 |
+
tmp[tid] += v0 * y[iybs + iqs + 0];
|
| 295 |
+
tmp[tid] += v1 * y[iybs + iqs + y_offset];
|
| 296 |
+
}
|
| 297 |
+
|
| 298 |
+
// sum up partial sums and write back result
|
| 299 |
+
__syncthreads();
|
| 300 |
+
for (int s=block_size/2; s>0; s>>=1) {
|
| 301 |
+
if (tid < s) {
|
| 302 |
+
tmp[tid] += tmp[tid + s];
|
| 303 |
+
}
|
| 304 |
+
__syncthreads();
|
| 305 |
+
}
|
| 306 |
+
if (tid == 0) {
|
| 307 |
+
dst[row] = tmp[0];
|
| 308 |
}
|
| 309 |
}
|
| 310 |
|
|
|
|
| 318 |
dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
|
| 319 |
}
|
| 320 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
|
| 322 |
const int nb = k / QK5_0;
|
| 323 |
dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
|
|
|
|
| 333 |
dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
|
| 334 |
}
|
| 335 |
|
| 336 |
+
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
| 337 |
+
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
|
| 338 |
+
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_0, QR4_0, dequantize_q4_0>
|
| 339 |
+
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
|
| 340 |
+
}
|
| 341 |
+
|
| 342 |
+
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
| 343 |
+
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
|
| 344 |
+
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_1, QR4_1, dequantize_q4_1>
|
| 345 |
+
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
|
| 346 |
+
}
|
| 347 |
+
|
| 348 |
+
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
| 349 |
+
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
|
| 350 |
+
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_0, QR5_0, dequantize_q5_0>
|
| 351 |
+
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
|
| 352 |
+
}
|
| 353 |
+
|
| 354 |
+
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
| 355 |
+
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
|
| 356 |
+
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_1, QR5_1, dequantize_q5_1>
|
| 357 |
+
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
|
| 358 |
+
}
|
| 359 |
+
|
| 360 |
+
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
| 361 |
+
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
|
| 362 |
+
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK8_0, QR8_0, dequantize_q8_0>
|
| 363 |
+
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
|
| 364 |
+
}
|
| 365 |
+
|
| 366 |
// TODO: optimize
|
| 367 |
static __global__ void convert_fp16_to_fp32(const void * vx, float * y) {
|
| 368 |
const half * x = (const half *) vx;
|
|
|
|
| 376 |
convert_fp16_to_fp32<<<k, 1, 0, stream>>>(x, y);
|
| 377 |
}
|
| 378 |
|
| 379 |
+
static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
| 380 |
+
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
|
| 381 |
+
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, 32, 1, convert_f16>
|
| 382 |
+
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
|
| 383 |
+
}
|
| 384 |
+
|
| 385 |
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
| 386 |
switch (type) {
|
| 387 |
case GGML_TYPE_Q4_0:
|
| 388 |
return dequantize_row_q4_0_cuda;
|
| 389 |
case GGML_TYPE_Q4_1:
|
| 390 |
return dequantize_row_q4_1_cuda;
|
|
|
|
|
|
|
| 391 |
case GGML_TYPE_Q5_0:
|
| 392 |
return dequantize_row_q5_0_cuda;
|
| 393 |
case GGML_TYPE_Q5_1:
|
|
|
|
| 401 |
}
|
| 402 |
}
|
| 403 |
|
| 404 |
+
static dequantize_mul_mat_vec_cuda_t ggml_get_dequantize_mul_mat_vec_cuda(ggml_type type) {
|
| 405 |
+
switch (type) {
|
| 406 |
+
case GGML_TYPE_Q4_0:
|
| 407 |
+
return dequantize_mul_mat_vec_q4_0_cuda;
|
| 408 |
+
case GGML_TYPE_Q4_1:
|
| 409 |
+
return dequantize_mul_mat_vec_q4_1_cuda;
|
| 410 |
+
case GGML_TYPE_Q5_0:
|
| 411 |
+
return dequantize_mul_mat_vec_q5_0_cuda;
|
| 412 |
+
case GGML_TYPE_Q5_1:
|
| 413 |
+
return dequantize_mul_mat_vec_q5_1_cuda;
|
| 414 |
+
case GGML_TYPE_Q8_0:
|
| 415 |
+
return dequantize_mul_mat_vec_q8_0_cuda;
|
| 416 |
+
case GGML_TYPE_F16:
|
| 417 |
+
return convert_mul_mat_vec_f16_cuda;
|
| 418 |
+
default:
|
| 419 |
+
return nullptr;
|
| 420 |
+
}
|
| 421 |
+
}
|
| 422 |
+
|
| 423 |
// buffer pool for cuda
|
| 424 |
+
#define MAX_CUDA_BUFFERS 256
|
| 425 |
|
| 426 |
struct scoped_spin_lock {
|
| 427 |
std::atomic_flag& lock;
|
|
|
|
| 479 |
CUDA_CHECK(cudaFree(ptr));
|
| 480 |
}
|
| 481 |
|
| 482 |
+
#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
|
| 483 |
#define GGML_CUDA_MAX_EVENTS 64
|
| 484 |
static cublasHandle_t g_cublasH = nullptr;
|
| 485 |
static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_STREAMS] = { nullptr };
|
|
|
|
| 718 |
const int nb2 = dst->nb[2];
|
| 719 |
const int nb3 = dst->nb[3];
|
| 720 |
const ggml_type type = src0->type;
|
| 721 |
+
const bool mul_mat_vec = ne11 == 1;
|
| 722 |
|
| 723 |
const float alpha = 1.0f;
|
| 724 |
const float beta = 0.0f;
|
|
|
|
| 729 |
const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
|
| 730 |
|
| 731 |
size_t x_size, y_size, d_size, q_size;
|
| 732 |
+
float * d_X = nullptr;
|
| 733 |
+
if (!mul_mat_vec) {
|
| 734 |
+
d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
|
| 735 |
+
}
|
| 736 |
float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
|
| 737 |
float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
|
| 738 |
char * d_Q = (char *) ggml_cuda_pool_malloc(n_mm * q_sz, &q_size);
|
| 739 |
|
| 740 |
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(type);
|
| 741 |
+
dequantize_mul_mat_vec_cuda_t dmmv = ggml_get_dequantize_mul_mat_vec_cuda(type);
|
| 742 |
GGML_ASSERT(to_fp32_cuda != nullptr);
|
| 743 |
|
| 744 |
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
|
|
|
| 748 |
cudaStream_t cudaStream2 = g_cudaStreams2[i % GGML_CUDA_MAX_STREAMS];
|
| 749 |
cudaEvent_t cudaEvent = g_cudaEvents[i % GGML_CUDA_MAX_EVENTS];
|
| 750 |
|
|
|
|
| 751 |
float * c_Y = d_Y + i * y_ne;
|
| 752 |
float * c_D = d_D + i * d_ne;
|
| 753 |
char * c_Q = d_Q + i * q_sz;
|
| 754 |
|
| 755 |
+
// copy src0 to device if necessary
|
| 756 |
+
if (src0->backend == GGML_BACKEND_CPU) {
|
| 757 |
+
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2));
|
| 758 |
+
} else if (src0->backend == GGML_BACKEND_CUDA) {
|
| 759 |
+
c_Q = ((char *) src0->data) + i * q_sz;
|
| 760 |
+
} else {
|
| 761 |
+
GGML_ASSERT(false);
|
| 762 |
+
}
|
| 763 |
+
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
|
| 764 |
+
CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
|
| 765 |
|
| 766 |
+
// copy src1 to device
|
| 767 |
+
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
|
| 768 |
|
| 769 |
+
// wait for data
|
| 770 |
+
CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
|
| 771 |
|
| 772 |
+
// compute
|
| 773 |
+
dmmv(c_Q, c_Y, c_D, ne00, ne01, cudaStream);
|
| 774 |
+
CUDA_CHECK(cudaGetLastError());
|
| 775 |
+
|
| 776 |
+
} else { // general dequantization kernel + cuBLAS matrix matrix multiplication
|
| 777 |
+
float * c_X = d_X + i * x_ne;
|
| 778 |
+
|
| 779 |
+
// convert src0 to fp32 on device
|
| 780 |
+
to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2);
|
| 781 |
+
CUDA_CHECK(cudaGetLastError());
|
| 782 |
+
CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
|
| 783 |
+
|
| 784 |
+
// copy src1 to device
|
| 785 |
+
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
|
| 786 |
+
|
| 787 |
+
// wait for conversion
|
| 788 |
+
CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
|
| 789 |
+
|
| 790 |
+
// compute
|
| 791 |
+
CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
|
| 792 |
+
CUBLAS_CHECK(
|
| 793 |
+
cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
|
| 794 |
+
ne01, ne11, ne10,
|
| 795 |
+
&alpha, c_X, ne00,
|
| 796 |
+
c_Y, ne10,
|
| 797 |
+
&beta, c_D, ne01));
|
| 798 |
+
}
|
| 799 |
|
| 800 |
// copy dst to host
|
| 801 |
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
|
|
|
| 804 |
}
|
| 805 |
|
| 806 |
CUDA_CHECK(cudaDeviceSynchronize());
|
| 807 |
+
if (!mul_mat_vec) {
|
| 808 |
+
ggml_cuda_pool_free(d_X, x_size);
|
| 809 |
+
}
|
| 810 |
ggml_cuda_pool_free(d_Y, y_size);
|
| 811 |
ggml_cuda_pool_free(d_D, d_size);
|
| 812 |
ggml_cuda_pool_free(d_Q, q_size);
|
|
|
|
| 822 |
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
| 823 |
src1->type == GGML_TYPE_F32 &&
|
| 824 |
dst->type == GGML_TYPE_F32 &&
|
| 825 |
+
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CUDA)) {
|
|
|
|
| 826 |
return true;
|
| 827 |
}
|
| 828 |
|
|
|
|
| 874 |
return 0;
|
| 875 |
}
|
| 876 |
}
|
| 877 |
+
|
| 878 |
+
void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
|
| 879 |
+
const int64_t ne0 = tensor->ne[0];
|
| 880 |
+
const int64_t ne1 = tensor->ne[1];
|
| 881 |
+
const int64_t ne2 = tensor->ne[2];
|
| 882 |
+
const int64_t ne3 = tensor->ne[3];
|
| 883 |
+
|
| 884 |
+
const ggml_type type = tensor->type;
|
| 885 |
+
const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
|
| 886 |
+
|
| 887 |
+
size_t q_size;
|
| 888 |
+
char * d_Q = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);
|
| 889 |
+
|
| 890 |
+
cudaStream_t cudaStream2 = g_cudaStreams2[0];
|
| 891 |
+
|
| 892 |
+
// copy tensor to device
|
| 893 |
+
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, tensor, 0, 0, cudaStream2));
|
| 894 |
+
CUDA_CHECK(cudaDeviceSynchronize());
|
| 895 |
+
|
| 896 |
+
tensor->data = d_Q;
|
| 897 |
+
tensor->backend = GGML_BACKEND_CUDA;
|
| 898 |
+
}
|
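
The new dequantize_mul_mat_vec kernel handles the common ne11 == 1 case (matrix × vector) without first materialising the dequantized matrix: each block of CUDA_DMMV_BLOCK_SIZE threads owns one row, every thread dequantizes two values per step, accumulates v0*y[...] + v1*y[...] into shared memory, and a tree reduction produces dst[row]. The index math can be confusing; the following plain-C++ illustration (not the kernel itself) walks one row with QK = 32 and QR = 2, showing which quantized byte and which two y positions each column pair maps to:

```cpp
#include <cstdio>

// Illustrative sketch (plain C++): the index math used by the new
// dequantize_mul_mat_vec kernel, for one row and QK = 32, QR = 2.
// Each step consumes one quantized byte (two values): the low nibble
// contributes at position iqs, the high nibble at iqs + QK/2 of the same block.
int main() {
    const int qk = 32, qr = 2;
    const int ncols = 128;

    for (int col = 0; col < ncols; col += 2) {
        const int ib   = col / qk;        // which quantized block of the row
        const int iqs  = (col % qk) / qr; // which byte inside that block
        const int iybs = col - col % qk;  // where that block starts in y

        printf("col %3d -> block %d, byte %2d, y[%3d] and y[%3d]\n",
               col, ib, iqs, iybs + iqs, iybs + iqs + qk / 2);
    }
    return 0;
}
```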
ggml-cuda.h
CHANGED
@@ -14,6 +14,8 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
 void * ggml_cuda_host_malloc(size_t size);
 void   ggml_cuda_host_free(void * ptr);
 
+void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
+
 #ifdef __cplusplus
 }
 #endif
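
ggml_cuda_transform_tensor() copies a tensor's (typically quantized) data to a device buffer once and flips its backend to GGML_BACKEND_CUDA, so later mat-muls can skip the host-to-device copy — this is also why ggml_cuda_can_mul_mat() now accepts small shapes whenever src0 already lives on the GPU. A hedged sketch of how a loader might use it (the loop and the guard are illustrative; the real code decides per tensor which weights to offload):

```cpp
#include <vector>
#include "ggml.h"
#include "ggml-cuda.h"

// Illustrative sketch: after the weights have been loaded on the host,
// push selected matrices to the GPU so they stay resident there.
void offload_weights(const std::vector<ggml_tensor *> & weights) {
    for (ggml_tensor * t : weights) {
        if (t->n_dims == 2 && ggml_is_quantized(t->type)) {  // only offload quantized matrices
            ggml_cuda_transform_tensor(t);                    // copies data, sets GGML_BACKEND_CUDA
        }
    }
}
```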
ggml-opencl.c
CHANGED
|
@@ -12,129 +12,129 @@
|
|
| 12 |
#define MULTILINE_QUOTE(...) #__VA_ARGS__
|
| 13 |
const char * clblast_dequant = MULTILINE_QUOTE(
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
struct block_q4_0
|
| 16 |
{
|
| 17 |
float d;
|
| 18 |
-
|
| 19 |
};
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
-
|
|
|
|
| 35 |
{
|
| 36 |
float d;
|
| 37 |
-
|
| 38 |
-
uchar qs[16];
|
| 39 |
};
|
| 40 |
|
| 41 |
-
__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
|
| 42 |
-
const uint i = get_global_id(0) / 32;
|
| 43 |
-
const uint l = get_local_id(0);
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
-
const
|
|
|
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
result[index + 1] = (vi >> 4) * d + m;
|
| 53 |
}
|
| 54 |
|
| 55 |
-
struct
|
| 56 |
-
|
| 57 |
-
ushort d;
|
| 58 |
-
uchar qs[8];
|
| 59 |
-
};
|
| 60 |
|
| 61 |
-
|
| 62 |
-
const uint
|
| 63 |
-
const uint l = get_local_id(0);
|
| 64 |
|
| 65 |
-
const float d =
|
|
|
|
| 66 |
|
| 67 |
-
const
|
|
|
|
| 68 |
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
result[index + 1] = ((vi >> 4) - 8)*d;
|
| 72 |
}
|
| 73 |
|
|
|
|
|
|
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
float d;
|
| 78 |
-
uint qh;
|
| 79 |
-
uchar qs[16];
|
| 80 |
-
};
|
| 81 |
|
| 82 |
-
|
| 83 |
-
const uint i = get_global_id(0) / 32;
|
| 84 |
-
const uint l = get_local_id(0);
|
| 85 |
|
| 86 |
-
|
| 87 |
|
| 88 |
-
const
|
|
|
|
| 89 |
|
| 90 |
-
const
|
|
|
|
| 91 |
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
const uint index = i*32 + l2;
|
| 96 |
-
result[index + 0] = (((vi & 0xf) | vh0) - 16)*d;
|
| 97 |
-
result[index + 1] = (((vi >> 4) | vh1) - 16)*d;
|
| 98 |
}
|
| 99 |
|
| 100 |
-
struct block_q5_1
|
| 101 |
-
|
| 102 |
-
ushort d;
|
| 103 |
-
ushort m;
|
| 104 |
-
uint qh;
|
| 105 |
-
uchar qs[16];
|
| 106 |
-
};
|
| 107 |
|
| 108 |
-
|
| 109 |
-
const uint
|
| 110 |
-
const uint l = get_local_id(0);
|
| 111 |
|
| 112 |
-
const float d = vload_half(0, (__global half*) &
|
| 113 |
-
const float m = vload_half(0, (__global half*) &
|
| 114 |
|
| 115 |
-
|
| 116 |
|
| 117 |
-
const
|
|
|
|
| 118 |
|
| 119 |
-
const
|
| 120 |
-
const
|
| 121 |
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
result[index + 1] = ((vi >> 4) | vh1)*d + m;
|
| 125 |
}
|
| 126 |
|
| 127 |
-
struct block_q8_0
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
};
|
| 132 |
-
|
| 133 |
-
__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) {
|
| 134 |
-
const uint i = get_global_id(0) / 32;
|
| 135 |
-
const uint l = get_local_id(0);
|
| 136 |
|
| 137 |
-
|
|
|
|
| 138 |
}
|
| 139 |
|
| 140 |
);
|
|
@@ -148,26 +148,12 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f
|
|
| 148 |
} \
|
| 149 |
} while (0)
|
| 150 |
|
| 151 |
-#define QK5_0 32
-typedef struct {
-    ggml_fp16_t d;          // delta
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2];  // nibbles / quants
-} block_q5_0;
-
-
-typedef struct {
-    float d;                // delta
-    uint32_t qh;            // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2];  // nibbles / quants
-} cl_block_q5_0;
-
| 165 |
static cl_platform_id platform;
|
| 166 |
static cl_device_id device;
|
| 167 |
static cl_context context;
|
| 168 |
static cl_command_queue queue;
|
| 169 |
static cl_program program;
|
| 170 |
-
static cl_kernel kernel_q4_0, kernel_q4_1,
|
| 171 |
static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
|
| 172 |
static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
|
| 173 |
|
|
@@ -238,8 +224,6 @@ void ggml_cl_init(void) {
|
|
| 238 |
CL_CHECK(err, "clCreateKernel");
|
| 239 |
kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
|
| 240 |
CL_CHECK(err, "clCreateKernel");
|
| 241 |
-
kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
|
| 242 |
-
CL_CHECK(err, "clCreateKernel");
|
| 243 |
kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
|
| 244 |
CL_CHECK(err, "clCreateKernel");
|
| 245 |
kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
|
|
@@ -274,7 +258,6 @@ void ggml_cl_sgemm_wrapper(
|
|
| 274 |
cl_kernel kernel;
|
| 275 |
size_t global = n * k, local, size_qb;
|
| 276 |
bool dequant;
|
| 277 |
-
cl_block_q5_0* cl_host_b;
|
| 278 |
|
| 279 |
switch (btype) {
|
| 280 |
case GGML_TYPE_F32:
|
|
@@ -292,28 +275,11 @@ void ggml_cl_sgemm_wrapper(
|
|
| 292 |
local = 16;
|
| 293 |
size_qb = global * (sizeof(float) * 2 + local) / 32;
|
| 294 |
break;
|
| 295 |
-        case GGML_TYPE_Q4_2:
-            dequant = true;
-            kernel = kernel_q4_2;
-            local = 8;
-            size_qb = global * (sizeof(ggml_fp16_t) + local) / 16;
-            break;
| 301 |
case GGML_TYPE_Q5_0:
|
| 302 |
dequant = true;
|
| 303 |
kernel = kernel_q5_0;
|
| 304 |
local = 16;
|
| 305 |
-
-            // 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU...
-            // TODO Find the reason, fix and remove workaround.
-            const block_q5_0* b = (const block_q5_0*) host_b;
-            cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32);
-            for (size_t i = 0; i < global / 32; i++) {
-                cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d);
-                memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t));
-                memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2);
-            }
-            host_b = (const float*) cl_host_b;
-            size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32;
| 317 |
break;
|
| 318 |
case GGML_TYPE_Q5_1:
|
| 319 |
dequant = true;
|
|
@@ -392,7 +358,4 @@ void ggml_cl_sgemm_wrapper(
|
|
| 392 |
clWaitForEvents(1, &ev_c);
|
| 393 |
clReleaseEvent(ev_sgemm);
|
| 394 |
clReleaseEvent(ev_c);
|
| 395 |
-    if (btype == GGML_TYPE_Q5_0) {
-        free((void*) cl_host_b);
-    }
|
| 398 |
}
|
|
|
|
| 12 |
#define MULTILINE_QUOTE(...) #__VA_ARGS__
|
| 13 |
const char * clblast_dequant = MULTILINE_QUOTE(
|
| 14 |
|
| 15 |
+
typedef uchar uint8_t;
|
| 16 |
+
typedef int int32_t;
|
| 17 |
+
typedef uint uint32_t;
|
| 18 |
+
|
| 19 |
+
constant uint QK4_0 = 32;
|
| 20 |
struct block_q4_0
|
| 21 |
{
|
| 22 |
float d;
|
| 23 |
+
uint8_t qs[QK4_0 / 2];
|
| 24 |
};
|
| 25 |
|
| 26 |
+
constant uint QK4_1 = 32;
|
| 27 |
+
struct block_q4_1
|
| 28 |
+
{
|
| 29 |
+
float d;
|
| 30 |
+
float m;
|
| 31 |
+
uint8_t qs[QK4_1 / 2];
|
| 32 |
+
};
|
| 33 |
|
| 34 |
+
constant uint QK5_0 = 32;
|
| 35 |
+
struct __attribute__ ((packed)) block_q5_0
|
| 36 |
+
{
|
| 37 |
+
half d;
|
| 38 |
+
uint32_t qh;
|
| 39 |
+
uint8_t qs[QK5_0 / 2];
|
| 40 |
+
};
|
| 41 |
|
| 42 |
+
constant uint QK5_1 = 32;
|
| 43 |
+
struct block_q5_1
|
| 44 |
+
{
|
| 45 |
+
half d;
|
| 46 |
+
half m;
|
| 47 |
+
uint32_t qh;
|
| 48 |
+
uint8_t qs[QK5_1 / 2];
|
| 49 |
+
};
|
| 50 |
|
| 51 |
+
constant uint QK8_0 = 32;
|
| 52 |
+
struct block_q8_0
|
| 53 |
{
|
| 54 |
float d;
|
| 55 |
+
uint8_t qs[QK8_0];
|
|
|
|
| 56 |
};
|
| 57 |
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
+
__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
|
| 60 |
+
constant uint qk = QK4_0;
|
| 61 |
+
|
| 62 |
+
const uint i = get_global_id(0) / qk;
|
| 63 |
+
const uint j = get_local_id(0);
|
| 64 |
+
|
| 65 |
+
const float d = x[i].d;
|
| 66 |
|
| 67 |
+
const int x0 = (x[i].qs[j] & 0xf) - 8;
|
| 68 |
+
const int x1 = (x[i].qs[j] >> 4) - 8;
|
| 69 |
|
| 70 |
+
y[i*qk + j + 0 ] = x0*d;
|
| 71 |
+
y[i*qk + j + qk/2] = x1*d;
|
|
|
|
| 72 |
}
|
| 73 |
|
| 74 |
+
__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
|
| 75 |
+
constant uint qk = QK4_1;
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
+
const uint i = get_global_id(0) / qk;
|
| 78 |
+
const uint j = get_local_id(0);
|
|
|
|
| 79 |
|
| 80 |
+
const float d = x[i].d;
|
| 81 |
+
const float m = x[i].m;
|
| 82 |
|
| 83 |
+
const int x0 = (x[i].qs[j] & 0xf);
|
| 84 |
+
const int x1 = (x[i].qs[j] >> 4);
|
| 85 |
|
| 86 |
+
y[i*qk + j + 0 ] = x0*d + m;
|
| 87 |
+
y[i*qk + j + qk/2] = x1*d + m;
|
|
|
|
| 88 |
}
|
| 89 |
|
| 90 |
+
__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
|
| 91 |
+
constant uint qk = QK5_0;
|
| 92 |
|
| 93 |
+
const uint i = get_global_id(0) / qk;
|
| 94 |
+
const uint j = get_local_id(0);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
+
const float d = vload_half(0, (__global half*) &x[i].d);
|
|
|
|
|
|
|
| 97 |
|
| 98 |
+
uint32_t qh = x[i].qh;
|
| 99 |
|
| 100 |
+
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
| 101 |
+
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
| 102 |
|
| 103 |
+
const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
|
| 104 |
+
const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
|
| 105 |
|
| 106 |
+
y[i*qk + j + 0 ] = x0*d;
|
| 107 |
+
y[i*qk + j + qk/2] = x1*d;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
}
|
| 109 |
|
| 110 |
+
__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
|
| 111 |
+
constant uint qk = QK5_1;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
+
const uint i = get_global_id(0) / qk;
|
| 114 |
+
const uint j = get_local_id(0);
|
|
|
|
| 115 |
|
| 116 |
+
const float d = vload_half(0, (__global half*) &x[i].d);
|
| 117 |
+
const float m = vload_half(0, (__global half*) &x[i].m);
|
| 118 |
|
| 119 |
+
uint32_t qh = x[i].qh;
|
| 120 |
|
| 121 |
+
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
| 122 |
+
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
| 123 |
|
| 124 |
+
const int x0 = (x[i].qs[j] & 0xf) | xh_0;
|
| 125 |
+
const int x1 = (x[i].qs[j] >> 4) | xh_1;
|
| 126 |
|
| 127 |
+
y[i*qk + j + 0 ] = x0*d + m;
|
| 128 |
+
y[i*qk + j + qk/2] = x1*d + m;
|
|
|
|
| 129 |
}
|
| 130 |
|
| 131 |
+
__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
|
| 132 |
+
constant uint qk = QK8_0;
|
| 133 |
+
const uint i = get_global_id(0) / qk;
|
| 134 |
+
const uint j = get_local_id(0);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
+
const float d = x[i].d;
|
| 137 |
+
y[i*qk + j] = x[i].qs[j]*d;
|
| 138 |
}
|
| 139 |
|
| 140 |
);
|
|
|
|
| 148 |
} \
|
| 149 |
} while (0)
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
static cl_platform_id platform;
|
| 152 |
static cl_device_id device;
|
| 153 |
static cl_context context;
|
| 154 |
static cl_command_queue queue;
|
| 155 |
static cl_program program;
|
| 156 |
+
static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
|
| 157 |
static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
|
| 158 |
static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
|
| 159 |
|
|
|
|
| 224 |
CL_CHECK(err, "clCreateKernel");
|
| 225 |
kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
|
| 226 |
CL_CHECK(err, "clCreateKernel");
|
|
|
|
|
|
|
| 227 |
kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
|
| 228 |
CL_CHECK(err, "clCreateKernel");
|
| 229 |
kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
|
|
|
|
| 258 |
cl_kernel kernel;
|
| 259 |
size_t global = n * k, local, size_qb;
|
| 260 |
bool dequant;
|
|
|
|
| 261 |
|
| 262 |
switch (btype) {
|
| 263 |
case GGML_TYPE_F32:
|
|
|
|
| 275 |
local = 16;
|
| 276 |
size_qb = global * (sizeof(float) * 2 + local) / 32;
|
| 277 |
break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
case GGML_TYPE_Q5_0:
|
| 279 |
dequant = true;
|
| 280 |
kernel = kernel_q5_0;
|
| 281 |
local = 16;
|
| 282 |
+
size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
break;
|
| 284 |
case GGML_TYPE_Q5_1:
|
| 285 |
dequant = true;
|
|
|
|
| 358 |
clWaitForEvents(1, &ev_c);
|
| 359 |
clReleaseEvent(ev_sgemm);
|
| 360 |
clReleaseEvent(ev_c);
|
|
|
|
|
|
|
|
|
|
| 361 |
}
|
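
The size_qb expressions in ggml_cl_sgemm_wrapper compute how many bytes the quantized operand occupies: global is the number of values (n*k), and each group of 32 values is one block. With the CPU fp16-to-fp32 workaround gone, the q5_0 path now uses the real block size directly: sizeof(ggml_fp16_t) + sizeof(uint32_t) + 16 = 22 bytes per 32 values. A small worked example (plain C++; the matrix dimensions and the uint16_t stand-in for ggml_fp16_t are illustrative):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Illustrative sketch: bytes needed for a q5_0-quantized matrix, mirroring
// size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32
// from ggml_cl_sgemm_wrapper (local = 16 quant bytes per 32-value block).
int main() {
    const size_t n = 4096, k = 4096;      // example matrix dimensions
    const size_t global = n * k;          // number of quantized values
    const size_t local  = 16;             // nibble bytes per block

    const size_t size_qb = global * (sizeof(uint16_t) /* fp16 d */ + sizeof(uint32_t) /* qh */ + local) / 32;
    printf("q5_0 buffer: %zu bytes (%.4f bytes per value)\n", size_qb, (double) size_qb / global);
    return 0;
}
```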
ggml.c
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ggml.h
CHANGED
|
@@ -190,9 +190,12 @@
|
|
| 190 |
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
|
| 191 |
#define GGML_FILE_VERSION 1
|
| 192 |
|
|
|
|
|
|
|
|
|
|
| 193 |
#define GGML_MAX_DIMS 4
|
| 194 |
#define GGML_MAX_NODES 4096
|
| 195 |
-
#define GGML_MAX_PARAMS
|
| 196 |
#define GGML_MAX_CONTEXTS 64
|
| 197 |
#define GGML_MAX_OPT 4
|
| 198 |
#define GGML_DEFAULT_N_THREADS 4
|
|
@@ -231,7 +234,7 @@ extern "C" {
|
|
| 231 |
GGML_TYPE_F16 = 1,
|
| 232 |
GGML_TYPE_Q4_0 = 2,
|
| 233 |
GGML_TYPE_Q4_1 = 3,
|
| 234 |
-    GGML_TYPE_Q4_2 = 4,
|
| 235 |
// GGML_TYPE_Q4_3 (5) support has been removed
|
| 236 |
GGML_TYPE_Q5_0 = 6,
|
| 237 |
GGML_TYPE_Q5_1 = 7,
|
|
@@ -243,6 +246,11 @@ extern "C" {
|
|
| 243 |
GGML_TYPE_COUNT,
|
| 244 |
};
|
| 245 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
// model file types
|
| 247 |
enum ggml_ftype {
|
| 248 |
GGML_FTYPE_UNKNOWN = -1,
|
|
@@ -251,7 +259,6 @@ extern "C" {
|
|
| 251 |
GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
| 252 |
GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
| 253 |
GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
| 254 |
-    GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
|
| 255 |
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
| 256 |
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
| 257 |
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
|
@@ -263,12 +270,16 @@ extern "C" {
|
|
| 263 |
|
| 264 |
GGML_OP_DUP,
|
| 265 |
GGML_OP_ADD,
|
|
|
|
|
|
|
| 266 |
GGML_OP_SUB,
|
| 267 |
GGML_OP_MUL,
|
| 268 |
GGML_OP_DIV,
|
| 269 |
GGML_OP_SQR,
|
| 270 |
GGML_OP_SQRT,
|
|
|
|
| 271 |
GGML_OP_SUM,
|
|
|
|
| 272 |
GGML_OP_MEAN,
|
| 273 |
GGML_OP_REPEAT,
|
| 274 |
GGML_OP_ABS,
|
|
@@ -278,12 +289,15 @@ extern "C" {
|
|
| 278 |
GGML_OP_RELU,
|
| 279 |
GGML_OP_GELU,
|
| 280 |
GGML_OP_SILU,
|
|
|
|
| 281 |
GGML_OP_NORM, // normalize
|
| 282 |
GGML_OP_RMS_NORM,
|
|
|
|
| 283 |
|
| 284 |
GGML_OP_MUL_MAT,
|
| 285 |
|
| 286 |
GGML_OP_SCALE,
|
|
|
|
| 287 |
GGML_OP_CPY,
|
| 288 |
GGML_OP_CONT,
|
| 289 |
GGML_OP_RESHAPE,
|
|
@@ -291,9 +305,13 @@ extern "C" {
|
|
| 291 |
GGML_OP_PERMUTE,
|
| 292 |
GGML_OP_TRANSPOSE,
|
| 293 |
GGML_OP_GET_ROWS,
|
|
|
|
|
|
|
| 294 |
GGML_OP_DIAG_MASK_INF,
|
|
|
|
| 295 |
GGML_OP_SOFT_MAX,
|
| 296 |
GGML_OP_ROPE,
|
|
|
|
| 297 |
GGML_OP_ALIBI,
|
| 298 |
GGML_OP_CONV_1D_1S,
|
| 299 |
GGML_OP_CONV_1D_2S,
|
|
@@ -322,7 +340,8 @@ extern "C" {
|
|
| 322 |
|
| 323 |
// n-dimensional tensor
|
| 324 |
struct ggml_tensor {
|
| 325 |
-
enum ggml_type
|
|
|
|
| 326 |
|
| 327 |
int n_dims;
|
| 328 |
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
|
@@ -353,7 +372,7 @@ extern "C" {
|
|
| 353 |
|
| 354 |
char name[32];
|
| 355 |
|
| 356 |
-
char padding[
|
| 357 |
};
|
| 358 |
|
| 359 |
// computation graph
|
|
@@ -497,6 +516,29 @@ extern "C" {
|
|
| 497 |
struct ggml_tensor * a,
|
| 498 |
struct ggml_tensor * b);
|
| 499 |
|
|
|
| 500 |
GGML_API struct ggml_tensor * ggml_sub(
|
| 501 |
struct ggml_context * ctx,
|
| 502 |
struct ggml_tensor * a,
|
|
@@ -520,12 +562,24 @@ extern "C" {
|
|
| 520 |
struct ggml_context * ctx,
|
| 521 |
struct ggml_tensor * a);
|
| 522 |
|
|
|
| 523 |
// return scalar
|
| 524 |
-
// TODO: compute sum along rows
|
| 525 |
GGML_API struct ggml_tensor * ggml_sum(
|
| 526 |
struct ggml_context * ctx,
|
| 527 |
struct ggml_tensor * a);
|
| 528 |
|
|
|
| 529 |
// mean along rows
|
| 530 |
GGML_API struct ggml_tensor * ggml_mean(
|
| 531 |
struct ggml_context * ctx,
|
|
@@ -567,6 +621,13 @@ extern "C" {
|
|
| 567 |
struct ggml_context * ctx,
|
| 568 |
struct ggml_tensor * a);
|
| 569 |
|
|
|
| 570 |
// normalize along rows
|
| 571 |
// TODO: eps is hardcoded to 1e-5 for now
|
| 572 |
GGML_API struct ggml_tensor * ggml_norm(
|
|
@@ -577,6 +638,13 @@ extern "C" {
|
|
| 577 |
struct ggml_context * ctx,
|
| 578 |
struct ggml_tensor * a);
|
| 579 |
|
|
|
| 580 |
// A: m rows, n columns
|
| 581 |
// B: p rows, n columns (i.e. we transpose it internally)
|
| 582 |
// result is m columns, p rows
|
|
@@ -589,12 +657,66 @@ extern "C" {
|
|
| 589 |
// operations on tensors without backpropagation
|
| 590 |
//
|
| 591 |
|
| 592 |
-
// in-place, returns view(a)
|
| 593 |
GGML_API struct ggml_tensor * ggml_scale(
|
| 594 |
struct ggml_context * ctx,
|
| 595 |
struct ggml_tensor * a,
|
| 596 |
struct ggml_tensor * b);
|
| 597 |
|
|
|
| 598 |
// a -> b, return view(b)
|
| 599 |
GGML_API struct ggml_tensor * ggml_cpy(
|
| 600 |
struct ggml_context * ctx,
|
|
@@ -615,6 +737,11 @@ extern "C" {
|
|
| 615 |
|
| 616 |
// return view(a)
|
| 617 |
// TODO: when we start computing gradient, make a copy instead of view
|
|
|
| 618 |
GGML_API struct ggml_tensor * ggml_reshape_2d(
|
| 619 |
struct ggml_context * ctx,
|
| 620 |
struct ggml_tensor * a,
|
|
@@ -630,6 +757,14 @@ extern "C" {
|
|
| 630 |
int64_t ne1,
|
| 631 |
int64_t ne2);
|
| 632 |
|
|
|
| 633 |
// offset in bytes
|
| 634 |
GGML_API struct ggml_tensor * ggml_view_1d(
|
| 635 |
struct ggml_context * ctx,
|
|
@@ -655,6 +790,18 @@ extern "C" {
|
|
| 655 |
size_t nb2, // slice stride in bytes
|
| 656 |
size_t offset);
|
| 657 |
|
|
|
| 658 |
GGML_API struct ggml_tensor * ggml_permute(
|
| 659 |
struct ggml_context * ctx,
|
| 660 |
struct ggml_tensor * a,
|
|
@@ -673,20 +820,50 @@ extern "C" {
|
|
| 673 |
struct ggml_tensor * a,
|
| 674 |
struct ggml_tensor * b);
|
| 675 |
|
|
|
|
| 676 |
// set elements above the diagonal to -INF
|
| 677 |
-
// in-place, returns view(a)
|
| 678 |
GGML_API struct ggml_tensor * ggml_diag_mask_inf(
|
| 679 |
struct ggml_context * ctx,
|
| 680 |
struct ggml_tensor * a,
|
| 681 |
int n_past);
|
| 682 |
|
| 683 |
// in-place, returns view(a)
|
|
|
|
|
|
| 684 |
GGML_API struct ggml_tensor * ggml_soft_max(
|
| 685 |
struct ggml_context * ctx,
|
| 686 |
struct ggml_tensor * a);
|
| 687 |
|
| 688 |
-
// rotary position embedding
|
| 689 |
// in-place, returns view(a)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 690 |
// if mode & 1 == 1, skip n_past elements
|
| 691 |
// if mode & 2 == 1, GPT-NeoX style
|
| 692 |
// TODO: avoid creating a new tensor every time
|
|
@@ -697,6 +874,23 @@ extern "C" {
|
|
| 697 |
int n_dims,
|
| 698 |
int mode);
|
| 699 |
|
|
|
|
| 700 |
// alibi position embedding
|
| 701 |
// in-place, returns view(a)
|
| 702 |
struct ggml_tensor * ggml_alibi(
|
|
@@ -741,13 +935,13 @@ extern "C" {
|
|
| 741 |
GGML_API struct ggml_tensor * ggml_map_unary_f32(
|
| 742 |
struct ggml_context * ctx,
|
| 743 |
struct ggml_tensor * a,
|
| 744 |
-
|
| 745 |
|
| 746 |
GGML_API struct ggml_tensor * ggml_map_binary_f32(
|
| 747 |
struct ggml_context * ctx,
|
| 748 |
struct ggml_tensor * a,
|
| 749 |
struct ggml_tensor * b,
|
| 750 |
-
|
| 751 |
|
| 752 |
//
|
| 753 |
// automatic differentiation
|
|
@@ -876,7 +1070,6 @@ extern "C" {
|
|
| 876 |
|
| 877 |
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 878 |
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 879 |
-
GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 880 |
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 881 |
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 882 |
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
|
|
| 190 |
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
|
| 191 |
#define GGML_FILE_VERSION 1
|
| 192 |
|
| 193 |
+
#define GGML_QNT_VERSION 1 // bump this on quantization format changes
|
| 194 |
+
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
| 195 |
+
|
| 196 |
#define GGML_MAX_DIMS 4
|
| 197 |
#define GGML_MAX_NODES 4096
|
| 198 |
+
#define GGML_MAX_PARAMS 256
|
| 199 |
#define GGML_MAX_CONTEXTS 64
|
| 200 |
#define GGML_MAX_OPT 4
|
| 201 |
#define GGML_DEFAULT_N_THREADS 4
|
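The two GGML_QNT_* constants above let a loader detect which revision of the quantization formats a file was written with, without bumping GGML_FILE_VERSION: the version is folded into the stored ftype. A hedged sketch of the convention (the whisper.cpp hunk further down uses the decode half; variable names here are illustrative):

    // writer side: fold the quantization version into the stored ftype
    const int32_t stored_ftype = GGML_QNT_VERSION*GGML_QNT_VERSION_FACTOR + (int32_t) ftype;

    // reader side: recover both fields again
    const int32_t qntvr     = stored_ftype / GGML_QNT_VERSION_FACTOR; // 0 for files written before this change
    const int32_t ftype_dec = stored_ftype % GGML_QNT_VERSION_FACTOR; // the plain ggml_ftype value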
|
|
|
| 234 |
GGML_TYPE_F16 = 1,
|
| 235 |
GGML_TYPE_Q4_0 = 2,
|
| 236 |
GGML_TYPE_Q4_1 = 3,
|
| 237 |
+
// GGML_TYPE_Q4_2 = 4, support has been removed
|
| 238 |
// GGML_TYPE_Q4_3 (5) support has been removed
|
| 239 |
GGML_TYPE_Q5_0 = 6,
|
| 240 |
GGML_TYPE_Q5_1 = 7,
|
|
|
|
| 246 |
GGML_TYPE_COUNT,
|
| 247 |
};
|
| 248 |
|
| 249 |
+
enum ggml_backend {
|
| 250 |
+
GGML_BACKEND_CPU = 0,
|
| 251 |
+
GGML_BACKEND_CUDA = 1,
|
| 252 |
+
};
|
| 253 |
+
|
| 254 |
// model file types
|
| 255 |
enum ggml_ftype {
|
| 256 |
GGML_FTYPE_UNKNOWN = -1,
|
|
|
|
| 259 |
GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
| 260 |
GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
| 261 |
GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
|
|
|
| 262 |
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
| 263 |
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
| 264 |
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
|
|
|
| 270 |
|
| 271 |
GGML_OP_DUP,
|
| 272 |
GGML_OP_ADD,
|
| 273 |
+
GGML_OP_ADD1,
|
| 274 |
+
GGML_OP_ACC,
|
| 275 |
GGML_OP_SUB,
|
| 276 |
GGML_OP_MUL,
|
| 277 |
GGML_OP_DIV,
|
| 278 |
GGML_OP_SQR,
|
| 279 |
GGML_OP_SQRT,
|
| 280 |
+
GGML_OP_LOG,
|
| 281 |
GGML_OP_SUM,
|
| 282 |
+
GGML_OP_SUM_ROWS,
|
| 283 |
GGML_OP_MEAN,
|
| 284 |
GGML_OP_REPEAT,
|
| 285 |
GGML_OP_ABS,
|
|
|
|
| 289 |
GGML_OP_RELU,
|
| 290 |
GGML_OP_GELU,
|
| 291 |
GGML_OP_SILU,
|
| 292 |
+
GGML_OP_SILU_BACK,
|
| 293 |
GGML_OP_NORM, // normalize
|
| 294 |
GGML_OP_RMS_NORM,
|
| 295 |
+
GGML_OP_RMS_NORM_BACK,
|
| 296 |
|
| 297 |
GGML_OP_MUL_MAT,
|
| 298 |
|
| 299 |
GGML_OP_SCALE,
|
| 300 |
+
GGML_OP_SET,
|
| 301 |
GGML_OP_CPY,
|
| 302 |
GGML_OP_CONT,
|
| 303 |
GGML_OP_RESHAPE,
|
|
|
|
| 305 |
GGML_OP_PERMUTE,
|
| 306 |
GGML_OP_TRANSPOSE,
|
| 307 |
GGML_OP_GET_ROWS,
|
| 308 |
+
GGML_OP_GET_ROWS_BACK,
|
| 309 |
+
GGML_OP_DIAG,
|
| 310 |
GGML_OP_DIAG_MASK_INF,
|
| 311 |
+
GGML_OP_DIAG_MASK_ZERO,
|
| 312 |
GGML_OP_SOFT_MAX,
|
| 313 |
GGML_OP_ROPE,
|
| 314 |
+
GGML_OP_ROPE_BACK,
|
| 315 |
GGML_OP_ALIBI,
|
| 316 |
GGML_OP_CONV_1D_1S,
|
| 317 |
GGML_OP_CONV_1D_2S,
|
|
|
|
| 340 |
|
| 341 |
// n-dimensional tensor
|
| 342 |
struct ggml_tensor {
|
| 343 |
+
enum ggml_type type;
|
| 344 |
+
enum ggml_backend backend;
|
| 345 |
|
| 346 |
int n_dims;
|
| 347 |
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
|
|
|
| 372 |
|
| 373 |
char name[32];
|
| 374 |
|
| 375 |
+
char padding[16];
|
| 376 |
};
|
| 377 |
|
| 378 |
// computation graph
|
|
|
|
| 516 |
struct ggml_tensor * a,
|
| 517 |
struct ggml_tensor * b);
|
| 518 |
|
| 519 |
+
GGML_API struct ggml_tensor * ggml_add1(
|
| 520 |
+
struct ggml_context * ctx,
|
| 521 |
+
struct ggml_tensor * a,
|
| 522 |
+
struct ggml_tensor * b);
|
| 523 |
+
|
| 524 |
+
GGML_API struct ggml_tensor * ggml_acc(
|
| 525 |
+
struct ggml_context * ctx,
|
| 526 |
+
struct ggml_tensor * a,
|
| 527 |
+
struct ggml_tensor * b,
|
| 528 |
+
size_t nb1,
|
| 529 |
+
size_t nb2,
|
| 530 |
+
size_t nb3,
|
| 531 |
+
size_t offset);
|
| 532 |
+
|
| 533 |
+
GGML_API struct ggml_tensor * ggml_acc_inplace(
|
| 534 |
+
struct ggml_context * ctx,
|
| 535 |
+
struct ggml_tensor * a,
|
| 536 |
+
struct ggml_tensor * b,
|
| 537 |
+
size_t nb1,
|
| 538 |
+
size_t nb2,
|
| 539 |
+
size_t nb3,
|
| 540 |
+
size_t offset);
|
| 541 |
+
|
| 542 |
GGML_API struct ggml_tensor * ggml_sub(
|
| 543 |
struct ggml_context * ctx,
|
| 544 |
struct ggml_tensor * a,
|
|
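ggml_add1 adds a single value b to every element of a, while ggml_acc accumulates b into a sub-view of a described by the byte strides nb1..nb3 and a byte offset. A usage sketch (shapes and names are illustrative, not taken from the diff):

    // accumulate a 4 x 2 tensor into rows 1..2 of a 4 x 8 tensor
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 8);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2);
    struct ggml_tensor * r = ggml_acc(ctx, a, b,
            a->nb[1], a->nb[2], a->nb[3], // reuse a's strides for the view
            1*a->nb[1]);                  // byte offset: skip the first row of a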
|
|
| 562 |
struct ggml_context * ctx,
|
| 563 |
struct ggml_tensor * a);
|
| 564 |
|
| 565 |
+
GGML_API struct ggml_tensor * ggml_log(
|
| 566 |
+
struct ggml_context * ctx,
|
| 567 |
+
struct ggml_tensor * a);
|
| 568 |
+
|
| 569 |
+
GGML_API struct ggml_tensor * ggml_log_inplace(
|
| 570 |
+
struct ggml_context * ctx,
|
| 571 |
+
struct ggml_tensor * a);
|
| 572 |
+
|
| 573 |
// return scalar
|
|
|
|
| 574 |
GGML_API struct ggml_tensor * ggml_sum(
|
| 575 |
struct ggml_context * ctx,
|
| 576 |
struct ggml_tensor * a);
|
| 577 |
|
| 578 |
+
// sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
|
| 579 |
+
GGML_API struct ggml_tensor * ggml_sum_rows(
|
| 580 |
+
struct ggml_context * ctx,
|
| 581 |
+
struct ggml_tensor * a);
|
| 582 |
+
|
| 583 |
// mean along rows
|
| 584 |
GGML_API struct ggml_tensor * ggml_mean(
|
| 585 |
struct ggml_context * ctx,
|
|
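A small shape example for the new row reduction (a sketch; the tensor is illustrative):

    // ne = {3, 2}: two rows of three elements each
    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3, 2);
    // ggml_sum_rows reduces dimension 0, so s has ne = {1, 2}
    struct ggml_tensor * s = ggml_sum_rows(ctx, t);
    // ggml_sum would instead collapse everything into a single scalar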
|
|
| 621 |
struct ggml_context * ctx,
|
| 622 |
struct ggml_tensor * a);
|
| 623 |
|
| 624 |
+
// a - x
|
| 625 |
+
// b - dy
|
| 626 |
+
GGML_API struct ggml_tensor * ggml_silu_back(
|
| 627 |
+
struct ggml_context * ctx,
|
| 628 |
+
struct ggml_tensor * a,
|
| 629 |
+
struct ggml_tensor * b);
|
| 630 |
+
|
| 631 |
// normalize along rows
|
| 632 |
// TODO: eps is hardcoded to 1e-5 for now
|
| 633 |
GGML_API struct ggml_tensor * ggml_norm(
|
|
|
|
| 638 |
struct ggml_context * ctx,
|
| 639 |
struct ggml_tensor * a);
|
| 640 |
|
| 641 |
+
// a - x
|
| 642 |
+
// b - dy
|
| 643 |
+
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
| 644 |
+
struct ggml_context * ctx,
|
| 645 |
+
struct ggml_tensor * a,
|
| 646 |
+
struct ggml_tensor * b);
|
| 647 |
+
|
| 648 |
// A: m rows, n columns
|
| 649 |
// B: p rows, n columns (i.e. we transpose it internally)
|
| 650 |
// result is m columns, p rows
|
|
|
|
| 657 |
// operations on tensors without backpropagation
|
| 658 |
//
|
| 659 |
|
|
|
|
| 660 |
GGML_API struct ggml_tensor * ggml_scale(
|
| 661 |
struct ggml_context * ctx,
|
| 662 |
struct ggml_tensor * a,
|
| 663 |
struct ggml_tensor * b);
|
| 664 |
|
| 665 |
+
// in-place, returns view(a)
|
| 666 |
+
GGML_API struct ggml_tensor * ggml_scale_inplace(
|
| 667 |
+
struct ggml_context * ctx,
|
| 668 |
+
struct ggml_tensor * a,
|
| 669 |
+
struct ggml_tensor * b);
|
| 670 |
+
|
| 671 |
+
// b -> view(a,offset,nb1,nb2,3), return modified a
|
| 672 |
+
GGML_API struct ggml_tensor * ggml_set(
|
| 673 |
+
struct ggml_context * ctx,
|
| 674 |
+
struct ggml_tensor * a,
|
| 675 |
+
struct ggml_tensor * b,
|
| 676 |
+
size_t nb1,
|
| 677 |
+
size_t nb2,
|
| 678 |
+
size_t nb3,
|
| 679 |
+
size_t offset);
|
| 680 |
+
|
| 681 |
+
// b -> view(a,offset,nb1,nb2,3), return view(a)
|
| 682 |
+
GGML_API struct ggml_tensor * ggml_set_inplace(
|
| 683 |
+
struct ggml_context * ctx,
|
| 684 |
+
struct ggml_tensor * a,
|
| 685 |
+
struct ggml_tensor * b,
|
| 686 |
+
size_t nb1,
|
| 687 |
+
size_t nb2,
|
| 688 |
+
size_t nb3,
|
| 689 |
+
size_t offset);
|
| 690 |
+
|
| 691 |
+
GGML_API struct ggml_tensor * ggml_set_1d(
|
| 692 |
+
struct ggml_context * ctx,
|
| 693 |
+
struct ggml_tensor * a,
|
| 694 |
+
struct ggml_tensor * b,
|
| 695 |
+
size_t offset);
|
| 696 |
+
|
| 697 |
+
GGML_API struct ggml_tensor * ggml_set_1d_inplace(
|
| 698 |
+
struct ggml_context * ctx,
|
| 699 |
+
struct ggml_tensor * a,
|
| 700 |
+
struct ggml_tensor * b,
|
| 701 |
+
size_t offset);
|
| 702 |
+
|
| 703 |
+
// b -> view(a,offset,nb1,nb2,3), return modified a
|
| 704 |
+
GGML_API struct ggml_tensor * ggml_set_2d(
|
| 705 |
+
struct ggml_context * ctx,
|
| 706 |
+
struct ggml_tensor * a,
|
| 707 |
+
struct ggml_tensor * b,
|
| 708 |
+
size_t nb1,
|
| 709 |
+
size_t offset);
|
| 710 |
+
|
| 711 |
+
// b -> view(a,offset,nb1,nb2,3), return view(a)
|
| 712 |
+
GGML_API struct ggml_tensor * ggml_set_2d_inplace(
|
| 713 |
+
struct ggml_context * ctx,
|
| 714 |
+
struct ggml_tensor * a,
|
| 715 |
+
struct ggml_tensor * b,
|
| 716 |
+
size_t nb1,
|
| 717 |
+
size_t offset);
|
| 718 |
+
|
| 719 |
+
|
| 720 |
// a -> b, return view(b)
|
| 721 |
GGML_API struct ggml_tensor * ggml_cpy(
|
| 722 |
struct ggml_context * ctx,
|
|
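Per the comments above, the ggml_set family writes b over a view of a selected by byte strides and a byte offset, with the _inplace variants returning a view of a rather than a modified copy. A brief sketch of the 1-d case (tensors and offsets are illustrative):

    // copy all of b over the first b->ne[0] elements of a
    struct ggml_tensor * r0 = ggml_set_1d(ctx, a, b, 0);
    // same, but starting 10 floats into a (the offset is in bytes)
    struct ggml_tensor * r1 = ggml_set_1d(ctx, a, b, 10*sizeof(float));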
|
|
| 737 |
|
| 738 |
// return view(a)
|
| 739 |
// TODO: when we start computing gradient, make a copy instead of view
|
| 740 |
+
GGML_API struct ggml_tensor * ggml_reshape_1d(
|
| 741 |
+
struct ggml_context * ctx,
|
| 742 |
+
struct ggml_tensor * a,
|
| 743 |
+
int64_t ne0);
|
| 744 |
+
|
| 745 |
GGML_API struct ggml_tensor * ggml_reshape_2d(
|
| 746 |
struct ggml_context * ctx,
|
| 747 |
struct ggml_tensor * a,
|
|
|
|
| 757 |
int64_t ne1,
|
| 758 |
int64_t ne2);
|
| 759 |
|
| 760 |
+
GGML_API struct ggml_tensor * ggml_reshape_4d(
|
| 761 |
+
struct ggml_context * ctx,
|
| 762 |
+
struct ggml_tensor * a,
|
| 763 |
+
int64_t ne0,
|
| 764 |
+
int64_t ne1,
|
| 765 |
+
int64_t ne2,
|
| 766 |
+
int64_t ne3);
|
| 767 |
+
|
| 768 |
// offset in bytes
|
| 769 |
GGML_API struct ggml_tensor * ggml_view_1d(
|
| 770 |
struct ggml_context * ctx,
|
|
|
|
| 790 |
size_t nb2, // slice stride in bytes
|
| 791 |
size_t offset);
|
| 792 |
|
| 793 |
+
GGML_API struct ggml_tensor * ggml_view_4d(
|
| 794 |
+
struct ggml_context * ctx,
|
| 795 |
+
struct ggml_tensor * a,
|
| 796 |
+
int64_t ne0,
|
| 797 |
+
int64_t ne1,
|
| 798 |
+
int64_t ne2,
|
| 799 |
+
int64_t ne3,
|
| 800 |
+
size_t nb1, // row stride in bytes
|
| 801 |
+
size_t nb2, // slice stride in bytes
|
| 802 |
+
size_t nb3,
|
| 803 |
+
size_t offset);
|
| 804 |
+
|
| 805 |
GGML_API struct ggml_tensor * ggml_permute(
|
| 806 |
struct ggml_context * ctx,
|
| 807 |
struct ggml_tensor * a,
|
|
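ggml_view_4d extends the existing 1-d/2-d/3-d views to four dimensions; as before, all strides and the offset are given in bytes. A sketch over a contiguous F32 tensor t (sizes are illustrative):

    struct ggml_tensor * v = ggml_view_4d(ctx, t,
            2, 3, 4, 1,   // ne0..ne3 of the view
            t->nb[1],     // nb1: row stride in bytes
            t->nb[2],     // nb2: slice stride in bytes
            t->nb[3],     // nb3: stride of the outermost dimension
            0);           // byte offset into t's data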
|
|
| 820 |
struct ggml_tensor * a,
|
| 821 |
struct ggml_tensor * b);
|
| 822 |
|
| 823 |
+
GGML_API struct ggml_tensor * ggml_get_rows_back(
|
| 824 |
+
struct ggml_context * ctx,
|
| 825 |
+
struct ggml_tensor * a,
|
| 826 |
+
struct ggml_tensor * b,
|
| 827 |
+
struct ggml_tensor * c);
|
| 828 |
+
|
| 829 |
+
GGML_API struct ggml_tensor * ggml_diag(
|
| 830 |
+
struct ggml_context * ctx,
|
| 831 |
+
struct ggml_tensor * a);
|
| 832 |
+
|
| 833 |
// set elements above the diagonal to -INF
|
|
|
|
| 834 |
GGML_API struct ggml_tensor * ggml_diag_mask_inf(
|
| 835 |
struct ggml_context * ctx,
|
| 836 |
struct ggml_tensor * a,
|
| 837 |
int n_past);
|
| 838 |
|
| 839 |
// in-place, returns view(a)
|
| 840 |
+
GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
|
| 841 |
+
struct ggml_context * ctx,
|
| 842 |
+
struct ggml_tensor * a,
|
| 843 |
+
int n_past);
|
| 844 |
+
|
| 845 |
+
// set elements above the diagonal to 0
|
| 846 |
+
GGML_API struct ggml_tensor * ggml_diag_mask_zero(
|
| 847 |
+
struct ggml_context * ctx,
|
| 848 |
+
struct ggml_tensor * a,
|
| 849 |
+
int n_past);
|
| 850 |
+
|
| 851 |
+
// in-place, returns view(a)
|
| 852 |
+
GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
|
| 853 |
+
struct ggml_context * ctx,
|
| 854 |
+
struct ggml_tensor * a,
|
| 855 |
+
int n_past);
|
| 856 |
+
|
| 857 |
GGML_API struct ggml_tensor * ggml_soft_max(
|
| 858 |
struct ggml_context * ctx,
|
| 859 |
struct ggml_tensor * a);
|
| 860 |
|
|
|
|
| 861 |
// in-place, returns view(a)
|
| 862 |
+
GGML_API struct ggml_tensor * ggml_soft_max_inplace(
|
| 863 |
+
struct ggml_context * ctx,
|
| 864 |
+
struct ggml_tensor * a);
|
| 865 |
+
|
| 866 |
+
// rotary position embedding
|
| 867 |
// if mode & 1 == 1, skip n_past elements
|
| 868 |
// if mode & 2 == 1, GPT-NeoX style
|
| 869 |
// TODO: avoid creating a new tensor every time
|
|
|
|
| 874 |
int n_dims,
|
| 875 |
int mode);
|
| 876 |
|
| 877 |
+
// in-place, returns view(a)
|
| 878 |
+
GGML_API struct ggml_tensor * ggml_rope_inplace(
|
| 879 |
+
struct ggml_context * ctx,
|
| 880 |
+
struct ggml_tensor * a,
|
| 881 |
+
int n_past,
|
| 882 |
+
int n_dims,
|
| 883 |
+
int mode);
|
| 884 |
+
|
| 885 |
+
// rotary position embedding backward, i.e compute dx from dy
|
| 886 |
+
// a - dy
|
| 887 |
+
GGML_API struct ggml_tensor * ggml_rope_back(
|
| 888 |
+
struct ggml_context * ctx,
|
| 889 |
+
struct ggml_tensor * a,
|
| 890 |
+
int n_past,
|
| 891 |
+
int n_dims,
|
| 892 |
+
int mode);
|
| 893 |
+
|
| 894 |
// alibi position embedding
|
| 895 |
// in-place, returns view(a)
|
| 896 |
struct ggml_tensor * ggml_alibi(
|
|
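The mode argument of the rope functions is a small bit field, as the comments above describe. A sketch of how a caller reads it (the concrete values are illustrative):

    const int mode = 2;                        // e.g. GPT-NeoX style, do not skip n_past
    const bool skip_n_past = (mode & 1) != 0;  // bit 0: skip the first n_past elements
    const bool neox_style  = (mode & 2) != 0;  // bit 1: GPT-NeoX style rotation
    struct ggml_tensor * q_rot = ggml_rope_inplace(ctx, q, n_past, n_dims, mode);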
|
|
| 935 |
GGML_API struct ggml_tensor * ggml_map_unary_f32(
|
| 936 |
struct ggml_context * ctx,
|
| 937 |
struct ggml_tensor * a,
|
| 938 |
+
ggml_unary_op_f32_t fun);
|
| 939 |
|
| 940 |
GGML_API struct ggml_tensor * ggml_map_binary_f32(
|
| 941 |
struct ggml_context * ctx,
|
| 942 |
struct ggml_tensor * a,
|
| 943 |
struct ggml_tensor * b,
|
| 944 |
+
ggml_binary_op_f32_t fun);
|
| 945 |
|
| 946 |
//
|
| 947 |
// automatic differentiation
|
|
|
|
| 1070 |
|
| 1071 |
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 1072 |
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
|
|
| 1073 |
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 1074 |
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 1075 |
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
whisper.cpp
CHANGED
|
@@ -291,15 +291,6 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
|
|
| 291 |
{ MODEL_LARGE, 1124ull*MB },
|
| 292 |
},
|
| 293 |
},
|
| 294 |
-
{ GGML_TYPE_Q4_2,
|
| 295 |
-
{
|
| 296 |
-
{ MODEL_TINY, 26ull*MB },
|
| 297 |
-
{ MODEL_BASE, 50ull*MB },
|
| 298 |
-
{ MODEL_SMALL, 154ull*MB },
|
| 299 |
-
{ MODEL_MEDIUM, 470ull*MB },
|
| 300 |
-
{ MODEL_LARGE, 940ull*MB },
|
| 301 |
-
},
|
| 302 |
-
},
|
| 303 |
{ GGML_TYPE_Q5_0,
|
| 304 |
{
|
| 305 |
{ MODEL_TINY, 30ull*MB },
|
|
@@ -861,6 +852,10 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 861 |
model.type = e_model::MODEL_LARGE;
|
| 862 |
}
|
| 863 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 864 |
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
| 865 |
// in order to save memory and also to speed up the computation
|
| 866 |
wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
|
|
@@ -882,6 +877,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 882 |
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
|
| 883 |
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
|
| 884 |
fprintf(stderr, "%s: ftype = %d\n", __func__, model.hparams.ftype);
|
|
|
|
| 885 |
fprintf(stderr, "%s: type = %d\n", __func__, model.type);
|
| 886 |
|
| 887 |
// print memory requirements
|
|
@@ -1106,7 +1102,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 1106 |
ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
|
| 1107 |
}
|
| 1108 |
|
| 1109 |
-
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*
|
| 1110 |
|
| 1111 |
fprintf(stderr, "%s: model ctx = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
| 1112 |
}
|
|
@@ -1554,14 +1550,14 @@ static bool whisper_encode_internal(
|
|
| 1554 |
Qcur),
|
| 1555 |
Qcur);
|
| 1556 |
|
| 1557 |
-
//Qcur =
|
| 1558 |
|
| 1559 |
// note: no bias for Key
|
| 1560 |
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
|
| 1561 |
layer.attn_k_w,
|
| 1562 |
cur);
|
| 1563 |
|
| 1564 |
-
//Kcur =
|
| 1565 |
|
| 1566 |
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
|
| 1567 |
layer.attn_v_w,
|
|
@@ -1621,12 +1617,12 @@ static bool whisper_encode_internal(
|
|
| 1621 |
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
| 1622 |
|
| 1623 |
struct ggml_tensor * KQ_scaled =
|
| 1624 |
-
|
| 1625 |
KQ,
|
| 1626 |
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
|
| 1627 |
);
|
| 1628 |
|
| 1629 |
-
struct ggml_tensor * KQ_soft_max =
|
| 1630 |
|
| 1631 |
struct ggml_tensor * V =
|
| 1632 |
ggml_cpy(ctx0,
|
|
@@ -1809,7 +1805,7 @@ static bool whisper_encode_internal(
|
|
| 1809 |
layer.cross_attn_k_w,
|
| 1810 |
cur);
|
| 1811 |
|
| 1812 |
-
Kcross =
|
| 1813 |
|
| 1814 |
wstate.use_buf(ctx0, 1);
|
| 1815 |
|
|
@@ -1956,14 +1952,14 @@ static bool whisper_decode_internal(
|
|
| 1956 |
Qcur),
|
| 1957 |
Qcur);
|
| 1958 |
|
| 1959 |
-
Qcur =
|
| 1960 |
|
| 1961 |
// note: no bias for Key
|
| 1962 |
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
|
| 1963 |
layer.attn_k_w,
|
| 1964 |
cur);
|
| 1965 |
|
| 1966 |
-
Kcur =
|
| 1967 |
|
| 1968 |
// store key and value to memory
|
| 1969 |
{
|
|
@@ -2012,14 +2008,14 @@ static bool whisper_decode_internal(
|
|
| 2012 |
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
| 2013 |
|
| 2014 |
//struct ggml_tensor * KQ_scaled =
|
| 2015 |
-
//
|
| 2016 |
// KQ,
|
| 2017 |
// ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
|
| 2018 |
// );
|
| 2019 |
|
| 2020 |
-
struct ggml_tensor * KQ_masked =
|
| 2021 |
|
| 2022 |
-
struct ggml_tensor * KQ_soft_max =
|
| 2023 |
|
| 2024 |
struct ggml_tensor * V =
|
| 2025 |
ggml_view_3d(ctx0, kv_self.v,
|
|
@@ -2083,7 +2079,7 @@ static bool whisper_decode_internal(
|
|
| 2083 |
Qcur),
|
| 2084 |
Qcur);
|
| 2085 |
|
| 2086 |
-
Qcur =
|
| 2087 |
|
| 2088 |
// Kcross is already scaled
|
| 2089 |
struct ggml_tensor * Kcross =
|
|
@@ -2123,15 +2119,15 @@ static bool whisper_decode_internal(
|
|
| 2123 |
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
| 2124 |
|
| 2125 |
//struct ggml_tensor * KQ_scaled =
|
| 2126 |
-
//
|
| 2127 |
// KQ,
|
| 2128 |
// ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
|
| 2129 |
// );
|
| 2130 |
|
| 2131 |
// no masking for cross-attention
|
| 2132 |
-
//struct ggml_tensor * KQ_masked =
|
| 2133 |
|
| 2134 |
-
struct ggml_tensor * KQ_soft_max =
|
| 2135 |
|
| 2136 |
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
| 2137 |
|
|
@@ -4903,7 +4899,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
| 4903 |
// b: N*N*sizeof(float)
|
| 4904 |
// c: N*N*sizeof(float)
|
| 4905 |
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
|
| 4906 |
-
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*
|
| 4907 |
|
| 4908 |
// put a bunch of random data in the buffer
|
| 4909 |
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
|
|
@@ -4911,7 +4907,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
| 4911 |
for (int j = 0; j < (int) sizes.size(); j++) {
|
| 4912 |
int n_q4_0 = 0;
|
| 4913 |
int n_q4_1 = 0;
|
| 4914 |
-
int n_q4_2 = 0;
|
| 4915 |
int n_q5_0 = 0;
|
| 4916 |
int n_q5_1 = 0;
|
| 4917 |
int n_q8_0 = 0;
|
|
@@ -4921,7 +4916,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
| 4921 |
// GFLOPS/s
|
| 4922 |
double s_q4_0 = 0.0;
|
| 4923 |
double s_q4_1 = 0.0;
|
| 4924 |
-
double s_q4_2 = 0.0;
|
| 4925 |
double s_q5_0 = 0.0;
|
| 4926 |
double s_q5_1 = 0.0;
|
| 4927 |
double s_q8_0 = 0.0;
|
|
@@ -4930,18 +4924,17 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
| 4930 |
|
| 4931 |
const size_t N = sizes[j];
|
| 4932 |
|
| 4933 |
-
for (int k = 0; k <
|
| 4934 |
const ggml_type wtype =
|
| 4935 |
k == 0 ? GGML_TYPE_Q4_0 :
|
| 4936 |
k == 1 ? GGML_TYPE_Q4_1 :
|
| 4937 |
-
k == 2 ?
|
| 4938 |
-
k == 3 ?
|
| 4939 |
-
k == 4 ?
|
| 4940 |
-
k == 5 ?
|
| 4941 |
-
k == 6 ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
| 4942 |
|
| 4943 |
-
double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ?
|
| 4944 |
-
int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ?
|
| 4945 |
|
| 4946 |
struct ggml_init_params gparams = {
|
| 4947 |
/*.mem_size =*/ buf.size(),
|
|
@@ -4985,9 +4978,9 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
| 4985 |
s = ((2.0*N*N*N*n)/tsum)*1e-9;
|
| 4986 |
}
|
| 4987 |
|
| 4988 |
-
// Q4_0 | Q4_1
|
| 4989 |
-
snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)
|
| 4990 |
-
N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1
|
| 4991 |
s += strbuf;
|
| 4992 |
|
| 4993 |
// Q5_0 | Q5_1 | Q8_0
|
|
|
|
| 291 |
{ MODEL_LARGE, 1124ull*MB },
|
| 292 |
},
|
| 293 |
},
|
|
|
| 294 |
{ GGML_TYPE_Q5_0,
|
| 295 |
{
|
| 296 |
{ MODEL_TINY, 30ull*MB },
|
|
|
|
| 852 |
model.type = e_model::MODEL_LARGE;
|
| 853 |
}
|
| 854 |
|
| 855 |
+
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
| 856 |
+
|
| 857 |
+
hparams.ftype %= GGML_QNT_VERSION_FACTOR;
|
| 858 |
+
|
| 859 |
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
| 860 |
// in order to save memory and also to speed up the computation
|
| 861 |
wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
|
|
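As a worked example of the two lines above: a model quantized with the current format to q5_1 stores ftype = 1*1000 + 9 = 1009, which decodes to qntvr = 1 and ftype = 9 (GGML_FTYPE_MOSTLY_Q5_1); files written before this change have ftype < 1000 and therefore decode to qntvr = 0.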
|
|
| 877 |
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
|
| 878 |
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
|
| 879 |
fprintf(stderr, "%s: ftype = %d\n", __func__, model.hparams.ftype);
|
| 880 |
+
fprintf(stderr, "%s: qntvr = %d\n", __func__, qntvr);
|
| 881 |
fprintf(stderr, "%s: type = %d\n", __func__, model.type);
|
| 882 |
|
| 883 |
// print memory requirements
|
|
|
|
| 1102 |
ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
|
| 1103 |
}
|
| 1104 |
|
| 1105 |
+
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead
|
| 1106 |
|
| 1107 |
fprintf(stderr, "%s: model ctx = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
| 1108 |
}
|
|
|
|
| 1550 |
Qcur),
|
| 1551 |
Qcur);
|
| 1552 |
|
| 1553 |
+
//Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
| 1554 |
|
| 1555 |
// note: no bias for Key
|
| 1556 |
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
|
| 1557 |
layer.attn_k_w,
|
| 1558 |
cur);
|
| 1559 |
|
| 1560 |
+
//Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
| 1561 |
|
| 1562 |
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
|
| 1563 |
layer.attn_v_w,
|
|
|
|
| 1617 |
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
| 1618 |
|
| 1619 |
struct ggml_tensor * KQ_scaled =
|
| 1620 |
+
ggml_scale_inplace(ctx0,
|
| 1621 |
KQ,
|
| 1622 |
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
|
| 1623 |
);
|
| 1624 |
|
| 1625 |
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);
|
| 1626 |
|
| 1627 |
struct ggml_tensor * V =
|
| 1628 |
ggml_cpy(ctx0,
|
|
|
|
| 1805 |
layer.cross_attn_k_w,
|
| 1806 |
cur);
|
| 1807 |
|
| 1808 |
+
Kcross = ggml_scale_inplace(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
|
| 1809 |
|
| 1810 |
wstate.use_buf(ctx0, 1);
|
| 1811 |
|
|
|
|
| 1952 |
Qcur),
|
| 1953 |
Qcur);
|
| 1954 |
|
| 1955 |
+
Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
| 1956 |
|
| 1957 |
// note: no bias for Key
|
| 1958 |
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
|
| 1959 |
layer.attn_k_w,
|
| 1960 |
cur);
|
| 1961 |
|
| 1962 |
+
Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
| 1963 |
|
| 1964 |
// store key and value to memory
|
| 1965 |
{
|
|
|
|
| 2008 |
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
| 2009 |
|
| 2010 |
//struct ggml_tensor * KQ_scaled =
|
| 2011 |
+
// ggml_scale_inplace(ctx0,
|
| 2012 |
// KQ,
|
| 2013 |
// ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
|
| 2014 |
// );
|
| 2015 |
|
| 2016 |
+
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);
|
| 2017 |
|
| 2018 |
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
| 2019 |
|
| 2020 |
struct ggml_tensor * V =
|
| 2021 |
ggml_view_3d(ctx0, kv_self.v,
|
|
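The scaling that the commented-out KQ_scaled block used to apply has moved into the Q and K projections above: with d = n_state/n_head, scaling Qcur and Kcur each by pow(d, -0.25) makes their product carry a combined factor of d^-0.5, i.e. the same 1/sqrt(n_state/n_head) as before, which is why KQ can now feed the diagonal mask and softmax directly without a separate ggml_scale.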
|
|
| 2079 |
Qcur),
|
| 2080 |
Qcur);
|
| 2081 |
|
| 2082 |
+
Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
| 2083 |
|
| 2084 |
// Kcross is already scaled
|
| 2085 |
struct ggml_tensor * Kcross =
|
|
|
|
| 2119 |
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
| 2120 |
|
| 2121 |
//struct ggml_tensor * KQ_scaled =
|
| 2122 |
+
// ggml_scale_inplace(ctx0,
|
| 2123 |
// KQ,
|
| 2124 |
// ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
|
| 2125 |
// );
|
| 2126 |
|
| 2127 |
// no masking for cross-attention
|
| 2128 |
+
//struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
| 2129 |
|
| 2130 |
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ);
|
| 2131 |
|
| 2132 |
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
| 2133 |
|
|
|
|
| 4899 |
// b: N*N*sizeof(float)
|
| 4900 |
// c: N*N*sizeof(float)
|
| 4901 |
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
|
| 4902 |
+
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);
|
| 4903 |
|
| 4904 |
// put a bunch of random data in the buffer
|
| 4905 |
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
|
|
|
|
| 4907 |
for (int j = 0; j < (int) sizes.size(); j++) {
|
| 4908 |
int n_q4_0 = 0;
|
| 4909 |
int n_q4_1 = 0;
|
|
|
|
| 4910 |
int n_q5_0 = 0;
|
| 4911 |
int n_q5_1 = 0;
|
| 4912 |
int n_q8_0 = 0;
|
|
|
|
| 4916 |
// GFLOPS/s
|
| 4917 |
double s_q4_0 = 0.0;
|
| 4918 |
double s_q4_1 = 0.0;
|
|
|
|
| 4919 |
double s_q5_0 = 0.0;
|
| 4920 |
double s_q5_1 = 0.0;
|
| 4921 |
double s_q8_0 = 0.0;
|
|
|
|
| 4924 |
|
| 4925 |
const size_t N = sizes[j];
|
| 4926 |
|
| 4927 |
+
for (int k = 0; k < 7; ++k) {
|
| 4928 |
const ggml_type wtype =
|
| 4929 |
k == 0 ? GGML_TYPE_Q4_0 :
|
| 4930 |
k == 1 ? GGML_TYPE_Q4_1 :
|
| 4931 |
+
k == 2 ? GGML_TYPE_Q5_0 :
|
| 4932 |
+
k == 3 ? GGML_TYPE_Q5_1 :
|
| 4933 |
+
k == 4 ? GGML_TYPE_Q8_0 :
|
| 4934 |
+
k == 5 ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
|
|
|
| 4935 |
|
| 4936 |
+
double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : k == 4 ? s_q8_0 : k == 5 ? s_fp16 : /*k == 6*/ s_fp32;
|
| 4937 |
+
int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : k == 4 ? n_q8_0 : k == 5 ? n_fp16 : /*k == 6*/ n_fp32;
|
| 4938 |
|
| 4939 |
struct ggml_init_params gparams = {
|
| 4940 |
/*.mem_size =*/ buf.size(),
|
|
|
|
| 4978 |
s = ((2.0*N*N*N*n)/tsum)*1e-9;
|
| 4979 |
}
|
| 4980 |
|
| 4981 |
+
// Q4_0 | Q4_1
|
| 4982 |
+
snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)\n",
|
| 4983 |
+
N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1);
|
| 4984 |
s += strbuf;
|
| 4985 |
|
| 4986 |
// Q5_0 | Q5_1 | Q8_0
|