ggerganov committed
Commit 803e1be · unverified · 1 Parent(s): 4ceb1ab

ggml : sync latest ggml


- New Q4 and Q5 formats
- Various improvements

Files changed (10)
  1. examples/common-ggml.cpp +0 -6
  2. examples/common.cpp +34 -0
  3. examples/common.h +3 -0
  4. examples/quantize/quantize.cpp +10 -4
  5. ggml-cuda.cu +291 -109
  6. ggml-cuda.h +2 -0
  7. ggml-opencl.c +85 -122
  8. ggml.c +0 -0
  9. ggml.h +205 -12
  10. whisper.cpp +31 -38
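
The commit message above highlights the new Q4 and Q5 block formats. As a rough orientation before the per-file diffs, here is a hedged, CPU-side C++ sketch of how a Q5_0 block is laid out and dequantized; it mirrors the `block_q5_0` struct and the `dequantize_q5_0` device function that appear in the ggml-cuda.cu diff below, with a plain `float` standing in for the fp16 delta and the `_sketch` names being illustrative rather than part of ggml.

    #include <cstdint>
    #include <cstring>

    // Illustrative Q5_0 block: 32 weights stored as 16 nibble pairs plus a 32-bit
    // mask holding each weight's 5th bit, scaled by a per-block delta.
    constexpr int QK5_0 = 32;

    struct block_q5_0_sketch {
        float   d;             // delta (fp16 in the real format)
        uint8_t qh[4];         // 5th bit of the quants
        uint8_t qs[QK5_0 / 2]; // low nibbles of the quants
    };

    // Dequantize one block into 32 floats, following the same bit layout as
    // dequantize_q5_0 in the CUDA diff: low nibble plus high bit, offset by 16.
    void dequantize_block_q5_0_sketch(const block_q5_0_sketch & b, float * y) {
        uint32_t qh;
        std::memcpy(&qh, b.qh, sizeof(qh));

        for (int j = 0; j < QK5_0 / 2; ++j) {
            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10; // bit j      -> bit 4
            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10; // bit j + 16 -> bit 4

            const int32_t x0 = ((b.qs[j] & 0x0f) | xh_0) - 16;
            const int32_t x1 = ((b.qs[j] >>   4) | xh_1) - 16;

            y[j]           = x0 * b.d;
            y[j + QK5_0/2] = x1 * b.d;
        }
    }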
examples/common-ggml.cpp CHANGED
@@ -6,7 +6,6 @@
 static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
     {"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
     {"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
-    {"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
     {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
     {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
     {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
@@ -46,7 +45,6 @@ bool ggml_common_quantize_0(
     switch (ftype) {
         case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
         case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
-        case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
         case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
         case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
         case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
@@ -171,10 +169,6 @@ bool ggml_common_quantize_0(
             {
                 cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
             } break;
-        case GGML_TYPE_Q4_2:
-            {
-                cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-            } break;
         case GGML_TYPE_Q5_0:
             {
                 cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
examples/common.cpp CHANGED
@@ -38,6 +38,20 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "-h" || arg == "--help") {
             gpt_print_usage(argc, argv, params);
             exit(0);
+        } else if (arg == "-f" || arg == "--file") {
+            if (++i > argc) {
+                fprintf(stderr, "Invalid file param");
+                break;
+            }
+            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                break;
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
+            if (params.prompt.back() == '\n') {
+                params.prompt.pop_back();
+            }
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             gpt_print_usage(argc, argv, params);
@@ -57,6 +71,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -t N, --threads N  number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
     fprintf(stderr, "                     prompt to start generation with (default: random)\n");
+    fprintf(stderr, "  -f FNAME, --file FNAME\n");
+    fprintf(stderr, "                     load prompt from a file\n");
     fprintf(stderr, "  -n N, --n_predict N  number of tokens to predict (default: %d)\n", params.n_predict);
     fprintf(stderr, "  --top_k N          top-k sampling (default: %d)\n", params.top_k);
     fprintf(stderr, "  --top_p N          top-p sampling (default: %.1f)\n", params.top_p);
@@ -192,6 +208,10 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
     return result;
 }
 
+void gpt_vocab::add_special_token(const std::string & token) {
+    special_tokens.push_back(token);
+}
+
 std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
     std::vector<std::string> words;
 
@@ -200,6 +220,20 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
     std::string str = text;
     std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
 
+    // Generate the subpattern from the special_tokens vector if it's not empty
+    if (!vocab.special_tokens.empty()) {
+        std::string special_tokens_subpattern;
+        for (const auto & token : vocab.special_tokens) {
+            if (!special_tokens_subpattern.empty()) {
+                special_tokens_subpattern += "|";
+            }
+            special_tokens_subpattern += token;
+        }
+
+        // Modify the regex pattern with the generated special tokens subpattern
+        pat = special_tokens_subpattern + "|" + pat;
+    }
+
     std::regex re(pat);
     std::smatch m;
 
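A note on the tokenizer change above: gpt_tokenize now builds an alternation of the registered special tokens and prepends it to the base GPT-2 pattern, so a special token is matched whole before the generic word/number rules apply. The self-contained C++ sketch below replays that logic outside the library; the `<|endoftext|>` token (pre-escaped for regex use) and the loop around std::regex_search are illustrative assumptions, not code from this commit.

    #include <iostream>
    #include <regex>
    #include <string>
    #include <vector>

    int main() {
        // Hypothetical special token, already escaped for use inside a regex.
        std::vector<std::string> special_tokens = { "<\\|endoftext\\|>" };

        // Base GPT-2 style pattern, as in examples/common.cpp.
        std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";

        // Same construction as the diff: OR the special tokens in front of the pattern.
        std::string special_tokens_subpattern;
        for (const auto & token : special_tokens) {
            if (!special_tokens_subpattern.empty()) {
                special_tokens_subpattern += "|";
            }
            special_tokens_subpattern += token;
        }
        pat = special_tokens_subpattern + "|" + pat;

        std::regex  re(pat);
        std::smatch m;
        std::string s = "hello<|endoftext|>";
        while (std::regex_search(s, m, re)) {
            std::cout << "token: '" << m.str() << "'\n"; // "hello", then "<|endoftext|>"
            s = m.suffix().str();
        }
        return 0;
    }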
examples/common.h CHANGED
@@ -53,6 +53,9 @@ struct gpt_vocab {
 
     std::map<token, id> token_to_id;
     std::map<id, token> id_to_token;
+    std::vector<std::string> special_tokens;
+
+    void add_special_token(const std::string & token);
 };
 
 // poor-man's JSON parsing
examples/quantize/quantize.cpp CHANGED
@@ -25,7 +25,7 @@ struct whisper_hparams {
     int32_t n_text_head = 6;
     int32_t n_text_layer = 4;
     int32_t n_mels = 80;
-    int32_t f16 = 1;
+    int32_t ftype = 1;
 };
 
 struct whisper_filters {
@@ -79,7 +79,10 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
     finp.read((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
     finp.read((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
     finp.read((char *) &hparams.n_mels, sizeof(hparams.n_mels));
-    finp.read((char *) &hparams.f16, sizeof(hparams.f16));
+    finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+
+    const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+    const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
 
     fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
     fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
@@ -91,7 +94,10 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
     fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
     fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
     fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
-    fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
+    fprintf(stderr, "%s: ftype (src) = %d\n", __func__, hparams.ftype);
+    fprintf(stderr, "%s: qntvr (src) = %d\n", __func__, qntvr_src);
+    fprintf(stderr, "%s: ftype (dst) = %d\n", __func__, ftype_dst);
+    fprintf(stderr, "%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
 
     fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
     fout.write((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
@@ -103,7 +109,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
     fout.write((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
     fout.write((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
     fout.write((char *) &hparams.n_mels, sizeof(hparams.n_mels));
-    fout.write((char *) &ftype, sizeof(hparams.f16));
+    fout.write((char *) &ftype_dst, sizeof(hparams.ftype));
 }
 
 // load mel filters
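
The new `ftype` header field packs a quantization-format version together with the model file type via the GGML_QNT_VERSION / GGML_QNT_VERSION_FACTOR constants added to ggml.h later in this commit. A minimal worked example of the arithmetic used in the hunk above; the concrete values 8 and 2 are illustrative:

    #include <cstdint>
    #include <cstdio>

    // Same constants as the ggml.h hunk later in this commit.
    #define GGML_QNT_VERSION        1
    #define GGML_QNT_VERSION_FACTOR 1000

    int main() {
        const int32_t ftype     = 8; // requested destination type, e.g. GGML_FTYPE_MOSTLY_Q5_0
        const int32_t ftype_src = 2; // value read from an older, pre-versioned source header

        const int32_t qntvr_src = ftype_src / GGML_QNT_VERSION_FACTOR;                // 2 / 1000   = 0
        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; // 1*1000 + 8 = 1008

        printf("qntvr (src) = %d\n", qntvr_src);                              // 0 -> pre-versioned file
        printf("ftype (dst) = %d\n", ftype_dst);                              // 1008
        printf("qntvr (dst) = %d\n", ftype_dst / GGML_QNT_VERSION_FACTOR);    // 1
        printf("type  (dst) = %d\n", ftype_dst % GGML_QNT_VERSION_FACTOR);    // 8
        return 0;
    }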
ggml-cuda.cu CHANGED
@@ -32,9 +32,15 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     } \
 } while (0)
 
+typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
 typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
+typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream);
+
+// QK = number of values after dequantization
+// QR = QK / number of values before dequantization
 
 #define QK4_0 32
+#define QR4_0 2
 typedef struct {
     float d;               // delta
     uint8_t qs[QK4_0 / 2]; // nibbles / quants
@@ -42,6 +48,7 @@ typedef struct {
 static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
 
 #define QK4_1 32
+#define QR4_1 2
 typedef struct {
     float d; // delta
     float m; // min
@@ -49,14 +56,8 @@ typedef struct {
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
 
-#define QK4_2 16
-typedef struct {
-    half d;                // delta
-    uint8_t qs[QK4_2 / 2]; // nibbles / quants
-} block_q4_2;
-static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
-
 #define QK5_0 32
+#define QR5_0 2
 typedef struct {
     half d;        // delta
     uint8_t qh[4]; // 5-th bit of quants
@@ -65,6 +66,7 @@ typedef struct {
 static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
 
 #define QK5_1 32
+#define QR5_1 2
 typedef struct {
     half d; // delta
     half m; // min
@@ -74,112 +76,164 @@ typedef struct {
 static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
 
 #define QK8_0 32
+#define QR8_0 1
 typedef struct {
     float d;          // delta
     int8_t qs[QK8_0]; // quants
 } block_q8_0;
 static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
 
-static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
+#define CUDA_DMMV_BLOCK_SIZE 32
+
+static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
     const block_q4_0 * x = (const block_q4_0 *) vx;
 
-    const int i = blockIdx.x;
+    const float d = x[ib].d;
 
-    const float d = x[i].d;
+    const uint8_t vui = x[ib].qs[iqs];
+
+    const int8_t vi0 = vui & 0xF;
+    const int8_t vi1 = vui >> 4;
 
-    const uint8_t * pp = x[i].qs;
+    v0 = (vi0 - 8)*d;
+    v1 = (vi1 - 8)*d;
+}
+
+static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q4_1 * x = (const block_q4_1 *) vx;
 
-    for (int l = 0; l < QK4_0; l += 2) {
-        const uint8_t vi = pp[l/2];
+    const float d = x[ib].d;
+    const float m = x[ib].m;
 
-        const int8_t vi0 = vi & 0xf;
-        const int8_t vi1 = vi >> 4;
+    const uint8_t vui = x[ib].qs[iqs];
 
-        const float v0 = (vi0 - 8)*d;
-        const float v1 = (vi1 - 8)*d;
+    const int8_t vi0 = vui & 0xF;
+    const int8_t vi1 = vui >> 4;
 
-        y[i*QK4_0 + l + 0] = v0;
-        y[i*QK4_0 + l + 1] = v1;
-    }
+    v0 = vi0*d + m;
+    v1 = vi1*d + m;
 }
 
-static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
-    const block_q4_1 * x = (const block_q4_1 *) vx;
+static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q5_0 * x = (const block_q5_0 *) vx;
 
-    const int i = blockIdx.x;
+    const float d = x[ib].d;
 
-    const float d = x[i].d;
-    const float m = x[i].m;
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
 
-    const uint8_t * pp = x[i].qs;
+    const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16;
+    const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16;
 
-    for (int l = 0; l < QK4_1; l += 2) {
-        const uint8_t vi = pp[l/2];
+    v0 = x0*d;
+    v1 = x1*d;
+}
 
-        const int8_t vi0 = vi & 0xf;
-        const int8_t vi1 = vi >> 4;
+static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q5_1 * x = (const block_q5_1 *) vx;
 
-        const float v0 = vi0*d + m;
-        const float v1 = vi1*d + m;
+    const float d = x[ib].d;
+    const float m = x[ib].m;
 
-        y[i*QK4_1 + l + 0] = v0;
-        y[i*QK4_1 + l + 1] = v1;
-    }
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
+
+    const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1);
+
+    v0 = x0*d + m;
+    v1 = x1*d + m;
 }
 
-static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
-    const block_q4_2 * x = (const block_q4_2 *) vx;
+static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q8_0 * x = (const block_q8_0 *) vx;
+
+    const float d = x[ib].d;
+
+    const int8_t vi0 = x[ib].qs[iqs + 0];
+    const int8_t vi1 = x[ib].qs[iqs + 1];
+
+    v0 = vi0*d;
+    v1 = vi1*d;
+}
+
+static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const half * x = (const half *) vx;
+
+    v0 = __half2float(x[ib + 0]);
+    v1 = __half2float(x[ib + 1]);
+}
+
+static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
+    static const int qk = QK4_0;
+
+    const block_q4_0 * x = (const block_q4_0 *) vx;
 
     const int i = blockIdx.x;
 
     const float d = x[i].d;
 
-    const uint8_t * pp = x[i].qs;
+    for (int j = 0; j < qk/2; ++j) {
+        const int x0 = (x[i].qs[j] & 0xf) - 8;
+        const int x1 = (x[i].qs[j] >> 4) - 8;
 
-    for (int l = 0; l < QK4_2; l += 2) {
-        const uint8_t vi = pp[l/2];
+        y[i*qk + j + 0   ] = x0*d;
+        y[i*qk + j + qk/2] = x1*d;
+    }
+}
+
+static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
+    static const int qk = QK4_1;
 
-        const int8_t vi0 = vi & 0xf;
-        const int8_t vi1 = vi >> 4;
+    const block_q4_1 * x = (const block_q4_1 *) vx;
 
-        const float v0 = (vi0 - 8)*d;
-        const float v1 = (vi1 - 8)*d;
+    const int i = blockIdx.x;
 
-        y[i*QK4_2 + l + 0] = v0;
-        y[i*QK4_2 + l + 1] = v1;
+    const float d = x[i].d;
+    const float m = x[i].m;
+
+    for (int j = 0; j < qk/2; ++j) {
+        const int x0 = (x[i].qs[j] & 0xf);
+        const int x1 = (x[i].qs[j] >> 4);
+
+        y[i*qk + j + 0   ] = x0*d + m;
+        y[i*qk + j + qk/2] = x1*d + m;
     }
 }
 
 static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
+    static const int qk = QK5_0;
+
     const block_q5_0 * x = (const block_q5_0 *) vx;
 
     const int i = blockIdx.x;
 
     const float d = x[i].d;
 
-    const uint8_t * pp = x[i].qs;
-
     uint32_t qh;
     memcpy(&qh, x[i].qh, sizeof(qh));
 
-    for (int l = 0; l < QK5_0; l += 2) {
-        const uint8_t vi = pp[l/2];
+    for (int j = 0; j < qk/2; ++j) {
+        const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+        const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
 
-        const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
-        const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
+        const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
+        const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
 
-        const int8_t vi0 = ((vi & 0xf) | vh0);
-        const int8_t vi1 = ((vi >> 4) | vh1);
-
-        const float v0 = (vi0 - 16)*d;
-        const float v1 = (vi1 - 16)*d;
-
-        y[i*QK5_0 + l + 0] = v0;
-        y[i*QK5_0 + l + 1] = v1;
+        y[i*qk + j + 0   ] = x0*d;
+        y[i*qk + j + qk/2] = x1*d;
     }
 }
 
 static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
+    static const int qk = QK5_1;
+
     const block_q5_1 * x = (const block_q5_1 *) vx;
 
     const int i = blockIdx.x;
@@ -187,41 +241,70 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
     const float d = x[i].d;
     const float m = x[i].m;
 
-    const uint8_t * pp = x[i].qs;
-
     uint32_t qh;
     memcpy(&qh, x[i].qh, sizeof(qh));
 
-    for (int l = 0; l < QK5_1; l += 2) {
-        const uint8_t vi = pp[l/2];
+    for (int j = 0; j < qk/2; ++j) {
+        const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+        const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
 
-        const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
-        const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
+        const int x0 = (x[i].qs[j] & 0xf) | xh_0;
+        const int x1 = (x[i].qs[j] >> 4) | xh_1;
 
-        const int8_t vi0 = (vi & 0xf) | vh0;
-        const int8_t vi1 = (vi >> 4) | vh1;
-
-        const float v0 = vi0*d + m;
-        const float v1 = vi1*d + m;
-
-        y[i*QK5_1 + l + 0] = v0;
-        y[i*QK5_1 + l + 1] = v1;
+        y[i*qk + j + 0   ] = x0*d + m;
+        y[i*qk + j + qk/2] = x1*d + m;
     }
 }
 
 static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
+    static const int qk = QK8_0;
+
     const block_q8_0 * x = (const block_q8_0 *) vx;
 
     const int i = blockIdx.x;
 
     const float d = x[i].d;
 
-    const int8_t * pp = x[i].qs;
+    for (int j = 0; j < qk; ++j) {
+        y[i*qk + j] = x[i].qs[j]*d;
+    }
+}
+
+template <int block_size, int qk, int qr, dequantize_kernel_t dequantize_kernel>
+static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols) {
+    const int row = blockIdx.x;
+    const int tid = threadIdx.x;
+
+    const int y_offset = qr == 1 ? 1 : qk/2;
 
-    for (int l = 0; l < QK8_0; l++) {
-        const int8_t vi = pp[l];
+    __shared__ float tmp[block_size]; // separate sum for each thread
+    tmp[tid] = 0;
 
-        y[i*QK8_0 + l] = vi*d;
+    for (int i = 0; i < ncols/block_size; i += 2) {
+        const int col = i*block_size + 2*tid;
+        const int ib = (row*ncols + col)/qk; // block index
+        const int iqs = (col%qk)/qr; // quant index
+        const int iybs = col - col%qk; // y block start index
+
+        // dequantize
+        float v0, v1;
+        dequantize_kernel(vx, ib, iqs, v0, v1);
+
+        // matrix multiplication
+        tmp[tid] += v0 * y[iybs + iqs + 0];
+        tmp[tid] += v1 * y[iybs + iqs + y_offset];
+    }
+
+    // sum up partial sums and write back result
+    __syncthreads();
+    for (int s=block_size/2; s>0; s>>=1) {
+        if (tid < s) {
+            tmp[tid] += tmp[tid + s];
+        }
+        __syncthreads();
+    }
+    if (tid == 0) {
+        dst[row] = tmp[0];
     }
 }
 
@@ -235,11 +318,6 @@ static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStre
     dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
 }
 
-static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
-    const int nb = k / QK4_2;
-    dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
-}
-
 static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK5_0;
     dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
@@ -255,6 +333,36 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStre
     dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
 }
 
+static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_0, QR4_0, dequantize_q4_0>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_1, QR4_1, dequantize_q4_1>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_0, QR5_0, dequantize_q5_0>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_1, QR5_1, dequantize_q5_1>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK8_0, QR8_0, dequantize_q8_0>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
 // TODO: optimize
 static __global__ void convert_fp16_to_fp32(const void * vx, float * y) {
     const half * x = (const half *) vx;
@@ -268,14 +376,18 @@ static void convert_fp16_to_fp32_cuda(const void * x, float * y, int k, cudaStre
     convert_fp16_to_fp32<<<k, 1, 0, stream>>>(x, y);
 }
 
+static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, 32, 1, convert_f16>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
 static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
            return dequantize_row_q4_0_cuda;
        case GGML_TYPE_Q4_1:
            return dequantize_row_q4_1_cuda;
-        case GGML_TYPE_Q4_2:
-            return dequantize_row_q4_2_cuda;
        case GGML_TYPE_Q5_0:
            return dequantize_row_q5_0_cuda;
        case GGML_TYPE_Q5_1:
@@ -289,8 +401,27 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     }
 }
 
+static dequantize_mul_mat_vec_cuda_t ggml_get_dequantize_mul_mat_vec_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_mul_mat_vec_q4_0_cuda;
+        case GGML_TYPE_Q4_1:
+            return dequantize_mul_mat_vec_q4_1_cuda;
+        case GGML_TYPE_Q5_0:
+            return dequantize_mul_mat_vec_q5_0_cuda;
+        case GGML_TYPE_Q5_1:
+            return dequantize_mul_mat_vec_q5_1_cuda;
+        case GGML_TYPE_Q8_0:
+            return dequantize_mul_mat_vec_q8_0_cuda;
+        case GGML_TYPE_F16:
+            return convert_mul_mat_vec_f16_cuda;
+        default:
+            return nullptr;
+    }
+}
+
 // buffer pool for cuda
-#define MAX_CUDA_BUFFERS 16
+#define MAX_CUDA_BUFFERS 256
 
 struct scoped_spin_lock {
     std::atomic_flag& lock;
@@ -348,7 +479,7 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
     CUDA_CHECK(cudaFree(ptr));
 }
 
-#define GGML_CUDA_MAX_STREAMS 8
+#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
 #define GGML_CUDA_MAX_EVENTS 64
 static cublasHandle_t g_cublasH = nullptr;
 static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_STREAMS] = { nullptr };
@@ -587,6 +718,7 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     const ggml_type type = src0->type;
+    const bool mul_mat_vec = ne11 == 1;
 
     const float alpha = 1.0f;
     const float beta = 0.0f;
@@ -597,12 +729,16 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
     const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
 
     size_t x_size, y_size, d_size, q_size;
-    float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
+    float * d_X = nullptr;
+    if (!mul_mat_vec) {
+        d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
+    }
     float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
     float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
     char  * d_Q = (char  *) ggml_cuda_pool_malloc(n_mm * q_sz, &q_size);
 
     const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(type);
+    dequantize_mul_mat_vec_cuda_t dmmv = ggml_get_dequantize_mul_mat_vec_cuda(type);
     GGML_ASSERT(to_fp32_cuda != nullptr);
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -612,31 +748,54 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
             cudaStream_t cudaStream2 = g_cudaStreams2[i % GGML_CUDA_MAX_STREAMS];
             cudaEvent_t  cudaEvent   = g_cudaEvents[i % GGML_CUDA_MAX_EVENTS];
 
-            float * c_X = d_X + i * x_ne;
             float * c_Y = d_Y + i * y_ne;
             float * c_D = d_D + i * d_ne;
             char  * c_Q = d_Q + i * q_sz;
 
-            // copy src0 and convert to fp32 on device
-            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2));
-            to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2);
-            CUDA_CHECK(cudaGetLastError());
-            CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
+            // copy src0 to device if necessary
+            if (src0->backend == GGML_BACKEND_CPU) {
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2));
+            } else if (src0->backend == GGML_BACKEND_CUDA) {
+                c_Q = ((char *) src0->data) + i * q_sz;
+            } else {
+                GGML_ASSERT(false);
+            }
+            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
+                CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
 
-            // copy src1 to device
-            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
+                // copy src1 to device
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
 
-            // wait for conversion
-            CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
+                // wait for data
+                CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
 
-            // compute
-            CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
-            CUBLAS_CHECK(
-                cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
-                        ne01, ne11, ne10,
-                        &alpha, c_X, ne00,
-                                c_Y, ne10,
-                        &beta,  c_D, ne01));
+                // compute
+                dmmv(c_Q, c_Y, c_D, ne00, ne01, cudaStream);
+                CUDA_CHECK(cudaGetLastError());
+
+            } else { // general dequantization kernel + cuBLAS matrix matrix multiplication
+                float * c_X = d_X + i * x_ne;
+
+                // convert src0 to fp32 on device
+                to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2);
+                CUDA_CHECK(cudaGetLastError());
+                CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
+
+                // copy src1 to device
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
+
+                // wait for conversion
+                CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
+
+                // compute
+                CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
+                CUBLAS_CHECK(
                    cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
+                            ne01, ne11, ne10,
+                            &alpha, c_X, ne00,
+                                    c_Y, ne10,
+                            &beta,  c_D, ne01));
+            }
 
             // copy dst to host
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
@@ -645,7 +804,9 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
     }
 
     CUDA_CHECK(cudaDeviceSynchronize());
-    ggml_cuda_pool_free(d_X, x_size);
+    if (!mul_mat_vec) {
+        ggml_cuda_pool_free(d_X, x_size);
+    }
     ggml_cuda_pool_free(d_Y, y_size);
     ggml_cuda_pool_free(d_D, d_size);
     ggml_cuda_pool_free(d_Q, q_size);
@@ -661,8 +822,7 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
         dst->type == GGML_TYPE_F32 &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CUDA)) {
         return true;
     }
 
@@ -714,3 +874,25 @@ size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct
         return 0;
     }
 }
+
+void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
+    const int64_t ne0 = tensor->ne[0];
+    const int64_t ne1 = tensor->ne[1];
+    const int64_t ne2 = tensor->ne[2];
+    const int64_t ne3 = tensor->ne[3];
+
+    const ggml_type type = tensor->type;
+    const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
+
+    size_t q_size;
+    char * d_Q = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);
+
+    cudaStream_t cudaStream2 = g_cudaStreams2[0];
+
+    // copy tensor to device
+    CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, tensor, 0, 0, cudaStream2));
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    tensor->data = d_Q;
+    tensor->backend = GGML_BACKEND_CUDA;
+}
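
For the dequantize_mul_mat_vec kernel introduced above: each CUDA block handles one matrix row, each of its 32 threads dequantizes two values per iteration, and the per-thread partial dot products are combined with a shared-memory tree reduction that halves the stride `s` each step. The short host-side C++ sketch below only replays the index arithmetic (ib, iqs, iybs) to make the mapping visible; it is not the CUDA implementation, and the printed thread and column choices are illustrative.

    #include <cstdio>

    // Host-side emulation of the index math in dequantize_mul_mat_vec for one row.
    int main() {
        const int block_size = 32;   // CUDA_DMMV_BLOCK_SIZE
        const int qk = 32, qr = 2;   // e.g. Q4_0: 32 values per block, 2 values per quant byte
        const int ncols = 128;       // must be a multiple of block_size
        const int row = 0;

        const int y_offset = qr == 1 ? 1 : qk/2;

        for (int tid = 0; tid < 4; ++tid) {              // print a few threads only
            for (int i = 0; i < ncols/block_size; i += 2) {
                const int col  = i*block_size + 2*tid;
                const int ib   = (row*ncols + col)/qk;   // quantized block index
                const int iqs  = (col % qk)/qr;          // index inside the block
                const int iybs = col - col % qk;         // start of the matching y block
                printf("tid %d: col %3d -> ib %d, iqs %2d, reads y[%3d] and y[%3d]\n",
                       tid, col, ib, iqs, iybs + iqs, iybs + iqs + y_offset);
            }
        }
        return 0;
    }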
ggml-cuda.h CHANGED
@@ -14,6 +14,8 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
 void * ggml_cuda_host_malloc(size_t size);
 void   ggml_cuda_host_free(void * ptr);
 
+void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
+
 #ifdef __cplusplus
 }
 #endif
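
A hedged usage sketch for the new ggml_cuda_transform_tensor declaration above, based on how the ggml-cuda.cu changes consume it: uploading a quantized weight tensor once marks it GGML_BACKEND_CUDA, so ggml_cuda_mul_mat_q_f32 can skip the per-multiplication host-to-device copy. The wrapper function name is an assumption for illustration, not part of this commit.

    #include "ggml.h"
    #include "ggml-cuda.h"

    // Illustrative helper: move an already-loaded weight tensor to the GPU once.
    void offload_weight(struct ggml_tensor * w) {
        ggml_cuda_transform_tensor(w); // copies w->data into device memory
        // w->data now points at device memory and w->backend == GGML_BACKEND_CUDA,
        // so the CUDA mul_mat path reuses it instead of re-uploading each call.
    }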
ggml-opencl.c CHANGED
@@ -12,129 +12,129 @@
 #define MULTILINE_QUOTE(...) #__VA_ARGS__
 const char * clblast_dequant = MULTILINE_QUOTE(
 
+typedef uchar uint8_t;
+typedef int int32_t;
+typedef uint uint32_t;
+
+constant uint QK4_0 = 32;
 struct block_q4_0
 {
     float d;
-    uchar qs[16];
+    uint8_t qs[QK4_0 / 2];
 };
 
-__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = blocks[i].d;
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint index = i*32 + l*2;
-    result[index + 0] = ((vi & 0xf) - 8)*d;
-    result[index + 1] = ((vi >> 4) - 8)*d;
-}
-
-struct block_q4_1
-{
-    float d;
-    float m;
-    uchar qs[16];
-};
-
-__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = blocks[i].d;
-    const float m = blocks[i].m;
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint index = i*32 + l*2;
-    result[index + 0] = (vi & 0xf) * d + m;
-    result[index + 1] = (vi >> 4) * d + m;
-}
-
-struct block_q4_2
-{
-    ushort d;
-    uchar qs[8];
-};
-
-__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 16;
-    const uint l = get_local_id(0);
-
-    const float d = vload_half(0, (__global half*) &blocks[i].d);
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint index = i*16 + l*2;
-    result[index + 0] = ((vi & 0xf) - 8)*d;
-    result[index + 1] = ((vi >> 4) - 8)*d;
-}
-
-struct block_q5_0
-{
-    float d;
-    uint qh;
-    uchar qs[16];
-};
-
-__kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = blocks[i].d;
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint l2 = l * 2;
-
-    const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
-    const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
-
-    const uint index = i*32 + l2;
-    result[index + 0] = (((vi & 0xf) | vh0) - 16)*d;
-    result[index + 1] = (((vi >> 4) | vh1) - 16)*d;
-}
-
-struct block_q5_1
-{
-    ushort d;
-    ushort m;
-    uint qh;
-    uchar qs[16];
-};
-
-__kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = vload_half(0, (__global half*) &blocks[i].d);
-    const float m = vload_half(0, (__global half*) &blocks[i].m);
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint l2 = l * 2;
-
-    const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
-    const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
-
-    const uint index = i*32 + l2;
-    result[index + 0] = ((vi & 0xf) | vh0)*d + m;
-    result[index + 1] = ((vi >> 4) | vh1)*d + m;
-}
-
-struct block_q8_0
-{
-    float d;
-    char qs[32];
-};
-
-__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
-}
+constant uint QK4_1 = 32;
+struct block_q4_1
+{
+    float d;
+    float m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+constant uint QK5_0 = 32;
+struct __attribute__ ((packed)) block_q5_0
+{
+    half d;
+    uint32_t qh;
+    uint8_t qs[QK5_0 / 2];
+};
+
+constant uint QK5_1 = 32;
+struct block_q5_1
+{
+    half d;
+    half m;
+    uint32_t qh;
+    uint8_t qs[QK5_1 / 2];
+};
+
+constant uint QK8_0 = 32;
+struct block_q8_0
+{
+    float d;
+    uint8_t qs[QK8_0];
+};
+
+
+__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
+    constant uint qk = QK4_0;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = x[i].d;
+
+    const int x0 = (x[i].qs[j] & 0xf) - 8;
+    const int x1 = (x[i].qs[j] >> 4) - 8;
+
+    y[i*qk + j + 0   ] = x0*d;
+    y[i*qk + j + qk/2] = x1*d;
+}
+
+__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
+    constant uint qk = QK4_1;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = x[i].d;
+    const float m = x[i].m;
+
+    const int x0 = (x[i].qs[j] & 0xf);
+    const int x1 = (x[i].qs[j] >> 4);
+
+    y[i*qk + j + 0   ] = x0*d + m;
+    y[i*qk + j + qk/2] = x1*d + m;
+}
+
+__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
+    constant uint qk = QK5_0;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = vload_half(0, (__global half*) &x[i].d);
+
+    uint32_t qh = x[i].qh;
+
+    const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
+
+    const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
+    const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
+
+    y[i*qk + j + 0   ] = x0*d;
+    y[i*qk + j + qk/2] = x1*d;
+}
+
+__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
+    constant uint qk = QK5_1;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = vload_half(0, (__global half*) &x[i].d);
+    const float m = vload_half(0, (__global half*) &x[i].m);
+
+    uint32_t qh = x[i].qh;
+
+    const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
+
+    const int x0 = (x[i].qs[j] & 0xf) | xh_0;
+    const int x1 = (x[i].qs[j] >> 4) | xh_1;
+
+    y[i*qk + j + 0   ] = x0*d + m;
+    y[i*qk + j + qk/2] = x1*d + m;
+}
+
+__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
+    constant uint qk = QK8_0;
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = x[i].d;
+    y[i*qk + j] = x[i].qs[j]*d;
+}
 
 );
@@ -148,26 +148,12 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f
     } \
 } while (0)
 
-#define QK5_0 32
-typedef struct {
-    ggml_fp16_t d;         // delta
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2]; // nibbles / quants
-} block_q5_0;
-
-
-typedef struct {
-    float d;               // delta
-    uint32_t qh;           // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2]; // nibbles / quants
-} cl_block_q5_0;
-
 static cl_platform_id platform;
 static cl_device_id device;
 static cl_context context;
 static cl_command_queue queue;
 static cl_program program;
-static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0;
+static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
 static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
 static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
 
@@ -238,8 +224,6 @@ void ggml_cl_init(void) {
     CL_CHECK(err, "clCreateKernel");
     kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
     CL_CHECK(err, "clCreateKernel");
-    kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
-    CL_CHECK(err, "clCreateKernel");
     kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
     CL_CHECK(err, "clCreateKernel");
     kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
@@ -274,7 +258,6 @@ void ggml_cl_sgemm_wrapper(
     cl_kernel kernel;
     size_t global = n * k, local, size_qb;
     bool dequant;
-    cl_block_q5_0* cl_host_b;
 
     switch (btype) {
     case GGML_TYPE_F32:
@@ -292,28 +275,11 @@ void ggml_cl_sgemm_wrapper(
         local = 16;
         size_qb = global * (sizeof(float) * 2 + local) / 32;
         break;
-    case GGML_TYPE_Q4_2:
-        dequant = true;
-        kernel = kernel_q4_2;
-        local = 8;
-        size_qb = global * (sizeof(ggml_fp16_t) + local) / 16;
-        break;
     case GGML_TYPE_Q5_0:
         dequant = true;
         kernel = kernel_q5_0;
         local = 16;
-        // For some reason OpenCL seems to be incapable of working with structs of size 22.
-        // 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU...
-        // TODO Find the reason, fix and remove workaround.
-        const block_q5_0* b = (const block_q5_0*) host_b;
-        cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32);
-        for (size_t i = 0; i < global / 32; i++) {
-            cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d);
-            memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t));
-            memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2);
-        }
-        host_b = (const float*) cl_host_b;
-        size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32;
+        size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
         break;
     case GGML_TYPE_Q5_1:
         dequant = true;
@@ -392,7 +358,4 @@ void ggml_cl_sgemm_wrapper(
     clWaitForEvents(1, &ev_c);
     clReleaseEvent(ev_sgemm);
     clReleaseEvent(ev_c);
-    if (btype == GGML_TYPE_Q5_0) {
-        free((void*) cl_host_b);
-    }
 }
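
The removed workaround above existed because the 22-byte q5_0 block (2-byte fp16 delta + 4-byte high-bit mask + 16 nibble bytes) was previously expanded on the CPU before upload; the new kernel source instead declares `block_q5_0` with `__attribute__ ((packed))`, which presumably keeps the device-side layout at exactly 22 bytes to match the new `size_qb` computation. A small host-side sanity check of that size arithmetic; the `_packed` struct name is illustrative and the 24-byte figure for an unpacked struct is a typical, not guaranteed, padding outcome:

    #include <cstdint>

    // 2 (fp16 delta as raw bits) + 4 (high-bit mask) + 16 (low nibbles) = 22 bytes.
    struct __attribute__((packed)) block_q5_0_packed {
        uint16_t d;      // fp16 delta stored as raw bits
        uint32_t qh;     // 5th bits of the 32 quants
        uint8_t  qs[16]; // low nibbles
    };

    static_assert(sizeof(block_q5_0_packed) == 2 + 4 + 16, "q5_0 block must stay 22 bytes");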
ggml.c CHANGED
The diff for this file is too large to render. See raw diff
 
ggml.h CHANGED
@@ -190,9 +190,12 @@
190
  #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
191
  #define GGML_FILE_VERSION 1
192
 
 
 
 
193
  #define GGML_MAX_DIMS 4
194
  #define GGML_MAX_NODES 4096
195
- #define GGML_MAX_PARAMS 16
196
  #define GGML_MAX_CONTEXTS 64
197
  #define GGML_MAX_OPT 4
198
  #define GGML_DEFAULT_N_THREADS 4
@@ -231,7 +234,7 @@ extern "C" {
231
  GGML_TYPE_F16 = 1,
232
  GGML_TYPE_Q4_0 = 2,
233
  GGML_TYPE_Q4_1 = 3,
234
- GGML_TYPE_Q4_2 = 4,
235
  // GGML_TYPE_Q4_3 (5) support has been removed
236
  GGML_TYPE_Q5_0 = 6,
237
  GGML_TYPE_Q5_1 = 7,
@@ -243,6 +246,11 @@ extern "C" {
243
  GGML_TYPE_COUNT,
244
  };
245
 
 
 
 
 
 
246
  // model file types
247
  enum ggml_ftype {
248
  GGML_FTYPE_UNKNOWN = -1,
@@ -251,7 +259,6 @@ extern "C" {
251
  GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
252
  GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
253
  GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
254
- GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
255
  GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
256
  GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
257
  GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
@@ -263,12 +270,16 @@ extern "C" {
263
 
264
  GGML_OP_DUP,
265
  GGML_OP_ADD,
 
 
266
  GGML_OP_SUB,
267
  GGML_OP_MUL,
268
  GGML_OP_DIV,
269
  GGML_OP_SQR,
270
  GGML_OP_SQRT,
 
271
  GGML_OP_SUM,
 
272
  GGML_OP_MEAN,
273
  GGML_OP_REPEAT,
274
  GGML_OP_ABS,
@@ -278,12 +289,15 @@ extern "C" {
278
  GGML_OP_RELU,
279
  GGML_OP_GELU,
280
  GGML_OP_SILU,
 
281
  GGML_OP_NORM, // normalize
282
  GGML_OP_RMS_NORM,
 
283
 
284
  GGML_OP_MUL_MAT,
285
 
286
  GGML_OP_SCALE,
 
287
  GGML_OP_CPY,
288
  GGML_OP_CONT,
289
  GGML_OP_RESHAPE,
@@ -291,9 +305,13 @@ extern "C" {
291
  GGML_OP_PERMUTE,
292
  GGML_OP_TRANSPOSE,
293
  GGML_OP_GET_ROWS,
 
 
294
  GGML_OP_DIAG_MASK_INF,
 
295
  GGML_OP_SOFT_MAX,
296
  GGML_OP_ROPE,
 
297
  GGML_OP_ALIBI,
298
  GGML_OP_CONV_1D_1S,
299
  GGML_OP_CONV_1D_2S,
@@ -322,7 +340,8 @@ extern "C" {
322
 
323
  // n-dimensional tensor
324
  struct ggml_tensor {
325
- enum ggml_type type;
 
326
 
327
  int n_dims;
328
  int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -353,7 +372,7 @@ extern "C" {
353
 
354
  char name[32];
355
 
356
- char padding[8]; // TODO: remove and add padding to name?
357
  };
358
 
359
  // computation graph
@@ -497,6 +516,29 @@ extern "C" {
497
  struct ggml_tensor * a,
498
  struct ggml_tensor * b);
499
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500
  GGML_API struct ggml_tensor * ggml_sub(
501
  struct ggml_context * ctx,
502
  struct ggml_tensor * a,
@@ -520,12 +562,24 @@ extern "C" {
520
  struct ggml_context * ctx,
521
  struct ggml_tensor * a);
522
 
 
 
 
 
 
 
 
 
523
  // return scalar
524
- // TODO: compute sum along rows
525
  GGML_API struct ggml_tensor * ggml_sum(
526
  struct ggml_context * ctx,
527
  struct ggml_tensor * a);
528
 
 
 
 
 
 
529
  // mean along rows
530
  GGML_API struct ggml_tensor * ggml_mean(
531
  struct ggml_context * ctx,
@@ -567,6 +621,13 @@ extern "C" {
567
  struct ggml_context * ctx,
568
  struct ggml_tensor * a);
569
 
 
 
 
 
 
 
 
570
  // normalize along rows
571
  // TODO: eps is hardcoded to 1e-5 for now
572
  GGML_API struct ggml_tensor * ggml_norm(
@@ -577,6 +638,13 @@ extern "C" {
577
  struct ggml_context * ctx,
578
  struct ggml_tensor * a);
579
 
 
 
 
 
 
 
 
580
  // A: m rows, n columns
581
  // B: p rows, n columns (i.e. we transpose it internally)
582
  // result is m columns, p rows
@@ -589,12 +657,66 @@ extern "C" {
589
  // operations on tensors without backpropagation
590
  //
591
 
592
- // in-place, returns view(a)
593
  GGML_API struct ggml_tensor * ggml_scale(
594
  struct ggml_context * ctx,
595
  struct ggml_tensor * a,
596
  struct ggml_tensor * b);
597
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
598
  // a -> b, return view(b)
599
  GGML_API struct ggml_tensor * ggml_cpy(
600
  struct ggml_context * ctx,
@@ -615,6 +737,11 @@ extern "C" {
615
 
616
  // return view(a)
617
  // TODO: when we start computing gradient, make a copy instead of view
 
 
 
 
 
618
  GGML_API struct ggml_tensor * ggml_reshape_2d(
619
  struct ggml_context * ctx,
620
  struct ggml_tensor * a,
@@ -630,6 +757,14 @@ extern "C" {
630
  int64_t ne1,
631
  int64_t ne2);
632
 
 
 
 
 
 
 
 
 
633
  // offset in bytes
634
  GGML_API struct ggml_tensor * ggml_view_1d(
635
  struct ggml_context * ctx,
@@ -655,6 +790,18 @@ extern "C" {
655
  size_t nb2, // slice stride in bytes
656
  size_t offset);
657
 
 
 
 
 
 
 
 
 
 
 
 
 
658
  GGML_API struct ggml_tensor * ggml_permute(
659
  struct ggml_context * ctx,
660
  struct ggml_tensor * a,
@@ -673,20 +820,50 @@ extern "C" {
673
  struct ggml_tensor * a,
674
  struct ggml_tensor * b);
675
 
 
 
 
 
 
 
 
 
 
 
676
  // set elements above the diagonal to -INF
677
- // in-place, returns view(a)
678
  GGML_API struct ggml_tensor * ggml_diag_mask_inf(
679
  struct ggml_context * ctx,
680
  struct ggml_tensor * a,
681
  int n_past);
682
 
683
  // in-place, returns view(a)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
684
  GGML_API struct ggml_tensor * ggml_soft_max(
685
  struct ggml_context * ctx,
686
  struct ggml_tensor * a);
687
 
688
- // rotary position embedding
689
  // in-place, returns view(a)
 
 
 
 
 
690
  // if mode & 1 == 1, skip n_past elements
691
  // if mode & 2 == 1, GPT-NeoX style
692
  // TODO: avoid creating a new tensor every time
@@ -697,6 +874,23 @@ extern "C" {
697
  int n_dims,
698
  int mode);
699
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
700
  // alibi position embedding
701
  // in-place, returns view(a)
702
  struct ggml_tensor * ggml_alibi(
@@ -741,13 +935,13 @@ extern "C" {
741
  GGML_API struct ggml_tensor * ggml_map_unary_f32(
742
  struct ggml_context * ctx,
743
  struct ggml_tensor * a,
744
- const ggml_unary_op_f32_t fun);
745
 
746
  GGML_API struct ggml_tensor * ggml_map_binary_f32(
747
  struct ggml_context * ctx,
748
  struct ggml_tensor * a,
749
  struct ggml_tensor * b,
750
- const ggml_binary_op_f32_t fun);
751
 
752
  //
753
  // automatic differentiation
@@ -876,7 +1070,6 @@ extern "C" {
876
 
877
  GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
878
  GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
879
- GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
880
  GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
881
  GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
882
  GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
 
190
  #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
191
  #define GGML_FILE_VERSION 1
192
 
193
+ #define GGML_QNT_VERSION 1 // bump this on quantization format changes
194
+ #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
195
+
196
  #define GGML_MAX_DIMS 4
197
  #define GGML_MAX_NODES 4096
198
+ #define GGML_MAX_PARAMS 256
199
  #define GGML_MAX_CONTEXTS 64
200
  #define GGML_MAX_OPT 4
201
  #define GGML_DEFAULT_N_THREADS 4
 
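For context, a minimal sketch (not part of this commit) of how a loader can unpack the quantization version that newer converters fold into the ftype header field using GGML_QNT_VERSION_FACTOR; the whisper.cpp hunk further down does exactly this, and `ftype_packed` here is a hypothetical variable name.

    #include <cstdint>
    #include <cstdio>
    #include "ggml.h"

    // split a packed ftype value into the plain ggml_ftype and the quantization version
    static void print_qnt_info(int32_t ftype_packed) {
        const int32_t qntvr = ftype_packed / GGML_QNT_VERSION_FACTOR; // quantization format version
        const int32_t ftype = ftype_packed % GGML_QNT_VERSION_FACTOR; // underlying ggml_ftype value

        std::printf("ftype = %d, qntvr = %d (current GGML_QNT_VERSION = %d)\n",
                ftype, qntvr, GGML_QNT_VERSION);
    }
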
234
  GGML_TYPE_F16 = 1,
235
  GGML_TYPE_Q4_0 = 2,
236
  GGML_TYPE_Q4_1 = 3,
237
+ // GGML_TYPE_Q4_2 = 4, support has been removed
238
  // GGML_TYPE_Q4_3 (5) support has been removed
239
  GGML_TYPE_Q5_0 = 6,
240
  GGML_TYPE_Q5_1 = 7,
 
246
  GGML_TYPE_COUNT,
247
  };
248
 
249
+ enum ggml_backend {
250
+ GGML_BACKEND_CPU = 0,
251
+ GGML_BACKEND_CUDA = 1,
252
+ };
253
+
254
  // model file types
255
  enum ggml_ftype {
256
  GGML_FTYPE_UNKNOWN = -1,
 
259
  GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
260
  GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
261
  GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
 
262
  GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
263
  GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
264
  GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
 
270
 
271
  GGML_OP_DUP,
272
  GGML_OP_ADD,
273
+ GGML_OP_ADD1,
274
+ GGML_OP_ACC,
275
  GGML_OP_SUB,
276
  GGML_OP_MUL,
277
  GGML_OP_DIV,
278
  GGML_OP_SQR,
279
  GGML_OP_SQRT,
280
+ GGML_OP_LOG,
281
  GGML_OP_SUM,
282
+ GGML_OP_SUM_ROWS,
283
  GGML_OP_MEAN,
284
  GGML_OP_REPEAT,
285
  GGML_OP_ABS,
 
289
  GGML_OP_RELU,
290
  GGML_OP_GELU,
291
  GGML_OP_SILU,
292
+ GGML_OP_SILU_BACK,
293
  GGML_OP_NORM, // normalize
294
  GGML_OP_RMS_NORM,
295
+ GGML_OP_RMS_NORM_BACK,
296
 
297
  GGML_OP_MUL_MAT,
298
 
299
  GGML_OP_SCALE,
300
+ GGML_OP_SET,
301
  GGML_OP_CPY,
302
  GGML_OP_CONT,
303
  GGML_OP_RESHAPE,
 
305
  GGML_OP_PERMUTE,
306
  GGML_OP_TRANSPOSE,
307
  GGML_OP_GET_ROWS,
308
+ GGML_OP_GET_ROWS_BACK,
309
+ GGML_OP_DIAG,
310
  GGML_OP_DIAG_MASK_INF,
311
+ GGML_OP_DIAG_MASK_ZERO,
312
  GGML_OP_SOFT_MAX,
313
  GGML_OP_ROPE,
314
+ GGML_OP_ROPE_BACK,
315
  GGML_OP_ALIBI,
316
  GGML_OP_CONV_1D_1S,
317
  GGML_OP_CONV_1D_2S,
 
340
 
341
  // n-dimensional tensor
342
  struct ggml_tensor {
343
+ enum ggml_type type;
344
+ enum ggml_backend backend;
345
 
346
  int n_dims;
347
  int64_t ne[GGML_MAX_DIMS]; // number of elements
 
372
 
373
  char name[32];
374
 
375
+ char padding[16];
376
  };
377
 
378
  // computation graph
 
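A small sketch (the context size is arbitrary, field layout as declared above) of the new per-tensor `type`/`backend` fields; freshly created tensors stay on the CPU backend unless explicitly offloaded.

    #include <cstdio>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 32);

        // each tensor now records which backend owns its data
        std::printf("type = %d, backend = %d, ne = [%lld, %lld]\n",
                (int) t->type, (int) t->backend, (long long) t->ne[0], (long long) t->ne[1]);

        ggml_free(ctx);
        return 0;
    }
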
516
  struct ggml_tensor * a,
517
  struct ggml_tensor * b);
518
 
519
+ GGML_API struct ggml_tensor * ggml_add1(
520
+ struct ggml_context * ctx,
521
+ struct ggml_tensor * a,
522
+ struct ggml_tensor * b);
523
+
524
+ GGML_API struct ggml_tensor * ggml_acc(
525
+ struct ggml_context * ctx,
526
+ struct ggml_tensor * a,
527
+ struct ggml_tensor * b,
528
+ size_t nb1,
529
+ size_t nb2,
530
+ size_t nb3,
531
+ size_t offset);
532
+
533
+ GGML_API struct ggml_tensor * ggml_acc_inplace(
534
+ struct ggml_context * ctx,
535
+ struct ggml_tensor * a,
536
+ struct ggml_tensor * b,
537
+ size_t nb1,
538
+ size_t nb2,
539
+ size_t nb3,
540
+ size_t offset);
541
+
542
  GGML_API struct ggml_tensor * ggml_sub(
543
  struct ggml_context * ctx,
544
  struct ggml_tensor * a,
 
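A brief sketch (not from the commit) of the two new element-wise ops: ggml_add1 broadcasts a 1-element tensor over `a`, and ggml_acc adds `b` into a strided view of `a`, following the same nb/offset convention as the ggml_set family below; `a` and `row` are hypothetical tensors living in `ctx`.

    #include "ggml.h"

    // a: 2-D f32 tensor, row: 1-D f32 tensor of length a->ne[0]
    static struct ggml_tensor * add1_and_acc(struct ggml_context * ctx,
                                             struct ggml_tensor  * a,
                                             struct ggml_tensor  * row) {
        // a + 1.0f, element-wise (the scalar is broadcast)
        struct ggml_tensor * a1 = ggml_add1(ctx, a, ggml_new_f32(ctx, 1.0f));

        // accumulate `row` into the second row of the result:
        // strides come from `a`, the byte offset of one full row selects row 1
        return ggml_acc(ctx, a1, row, a->nb[1], a->nb[2], a->nb[3], a->nb[1]);
    }
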
562
  struct ggml_context * ctx,
563
  struct ggml_tensor * a);
564
 
565
+ GGML_API struct ggml_tensor * ggml_log(
566
+ struct ggml_context * ctx,
567
+ struct ggml_tensor * a);
568
+
569
+ GGML_API struct ggml_tensor * ggml_log_inplace(
570
+ struct ggml_context * ctx,
571
+ struct ggml_tensor * a);
572
+
573
  // return scalar
 
574
  GGML_API struct ggml_tensor * ggml_sum(
575
  struct ggml_context * ctx,
576
  struct ggml_tensor * a);
577
 
578
+ // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
579
+ GGML_API struct ggml_tensor * ggml_sum_rows(
580
+ struct ggml_context * ctx,
581
+ struct ggml_tensor * a);
582
+
583
  // mean along rows
584
  GGML_API struct ggml_tensor * ggml_mean(
585
  struct ggml_context * ctx,
 
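As a quick illustration of the new reduction (a sketch, not from the commit): ggml_log is element-wise and ggml_sum_rows collapses the first dimension, so an input of shape [ne0, ne1, ne2, ne3] becomes [1, ne1, ne2, ne3].

    #include "ggml.h"

    // row-wise log-sums of an f32 tensor `a`; result has shape [1, a->ne[1], a->ne[2], a->ne[3]]
    static struct ggml_tensor * log_row_sums(struct ggml_context * ctx, struct ggml_tensor * a) {
        return ggml_sum_rows(ctx, ggml_log(ctx, a));
    }
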
621
  struct ggml_context * ctx,
622
  struct ggml_tensor * a);
623
 
624
+ // a - x
625
+ // b - dy
626
+ GGML_API struct ggml_tensor * ggml_silu_back(
627
+ struct ggml_context * ctx,
628
+ struct ggml_tensor * a,
629
+ struct ggml_tensor * b);
630
+
631
  // normalize along rows
632
  // TODO: eps is hardcoded to 1e-5 for now
633
  GGML_API struct ggml_tensor * ggml_norm(
 
638
  struct ggml_context * ctx,
639
  struct ggml_tensor * a);
640
 
641
+ // a - x
642
+ // b - dy
643
+ GGML_API struct ggml_tensor * ggml_rms_norm_back(
644
+ struct ggml_context * ctx,
645
+ struct ggml_tensor * a,
646
+ struct ggml_tensor * b);
647
+
648
  // A: m rows, n columns
649
  // B: p rows, n columns (i.e. we transpose it internally)
650
  // result is m columns, p rows
 
657
  // operations on tensors without backpropagation
658
  //
659
 
 
660
  GGML_API struct ggml_tensor * ggml_scale(
661
  struct ggml_context * ctx,
662
  struct ggml_tensor * a,
663
  struct ggml_tensor * b);
664
 
665
+ // in-place, returns view(a)
666
+ GGML_API struct ggml_tensor * ggml_scale_inplace(
667
+ struct ggml_context * ctx,
668
+ struct ggml_tensor * a,
669
+ struct ggml_tensor * b);
670
+
671
+ // b -> view(a,offset,nb1,nb2,nb3), return modified a
672
+ GGML_API struct ggml_tensor * ggml_set(
673
+ struct ggml_context * ctx,
674
+ struct ggml_tensor * a,
675
+ struct ggml_tensor * b,
676
+ size_t nb1,
677
+ size_t nb2,
678
+ size_t nb3,
679
+ size_t offset);
680
+
681
+ // b -> view(a,offset,nb1,nb2,nb3), return view(a)
682
+ GGML_API struct ggml_tensor * ggml_set_inplace(
683
+ struct ggml_context * ctx,
684
+ struct ggml_tensor * a,
685
+ struct ggml_tensor * b,
686
+ size_t nb1,
687
+ size_t nb2,
688
+ size_t nb3,
689
+ size_t offset);
690
+
691
+ GGML_API struct ggml_tensor * ggml_set_1d(
692
+ struct ggml_context * ctx,
693
+ struct ggml_tensor * a,
694
+ struct ggml_tensor * b,
695
+ size_t offset);
696
+
697
+ GGML_API struct ggml_tensor * ggml_set_1d_inplace(
698
+ struct ggml_context * ctx,
699
+ struct ggml_tensor * a,
700
+ struct ggml_tensor * b,
701
+ size_t offset);
702
+
703
+ // b -> view(a,offset,nb1,nb2,nb3), return modified a
704
+ GGML_API struct ggml_tensor * ggml_set_2d(
705
+ struct ggml_context * ctx,
706
+ struct ggml_tensor * a,
707
+ struct ggml_tensor * b,
708
+ size_t nb1,
709
+ size_t offset);
710
+
711
+ // b -> view(a,offset,nb1,nb2,nb3), return view(a)
712
+ GGML_API struct ggml_tensor * ggml_set_2d_inplace(
713
+ struct ggml_context * ctx,
714
+ struct ggml_tensor * a,
715
+ struct ggml_tensor * b,
716
+ size_t nb1,
717
+ size_t offset);
718
+
719
+
720
  // a -> b, return view(b)
721
  GGML_API struct ggml_tensor * ggml_cpy(
722
  struct ggml_context * ctx,
 
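A minimal sketch (with hypothetical `cache`/`cur` tensors) of the new in-place scale and the ggml_set family: the data of `cur` is scaled in place, then written into slot `i` of a flat cache tensor via ggml_set_1d, which returns the modified `cache`.

    #include "ggml.h"

    // cache: large 1-D f32 tensor, cur: smaller 1-D f32 tensor, several of which fit into cache
    static struct ggml_tensor * store_scaled(struct ggml_context * ctx,
                                             struct ggml_tensor  * cache,
                                             struct ggml_tensor  * cur,
                                             int                   i) {
        cur = ggml_scale_inplace(ctx, cur, ggml_new_f32(ctx, 0.5f));

        const size_t offset = (size_t) i*ggml_nbytes(cur); // byte offset of slot i
        return ggml_set_1d(ctx, cache, cur, offset);
    }
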
737
 
738
  // return view(a)
739
  // TODO: when we start computing gradient, make a copy instead of view
740
+ GGML_API struct ggml_tensor * ggml_reshape_1d(
741
+ struct ggml_context * ctx,
742
+ struct ggml_tensor * a,
743
+ int64_t ne0);
744
+
745
  GGML_API struct ggml_tensor * ggml_reshape_2d(
746
  struct ggml_context * ctx,
747
  struct ggml_tensor * a,
 
757
  int64_t ne1,
758
  int64_t ne2);
759
 
760
+ GGML_API struct ggml_tensor * ggml_reshape_4d(
761
+ struct ggml_context * ctx,
762
+ struct ggml_tensor * a,
763
+ int64_t ne0,
764
+ int64_t ne1,
765
+ int64_t ne2,
766
+ int64_t ne3);
767
+
768
  // offset in bytes
769
  GGML_API struct ggml_tensor * ggml_view_1d(
770
  struct ggml_context * ctx,
 
790
  size_t nb2, // slice stride in bytes
791
  size_t offset);
792
 
793
+ GGML_API struct ggml_tensor * ggml_view_4d(
794
+ struct ggml_context * ctx,
795
+ struct ggml_tensor * a,
796
+ int64_t ne0,
797
+ int64_t ne1,
798
+ int64_t ne2,
799
+ int64_t ne3,
800
+ size_t nb1, // row stride in bytes
801
+ size_t nb2, // slice stride in bytes
802
+ size_t nb3,
803
+ size_t offset);
804
+
805
  GGML_API struct ggml_tensor * ggml_permute(
806
  struct ggml_context * ctx,
807
  struct ggml_tensor * a,
 
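One way the new 4-D view can be used (a sketch with hypothetical names, not taken from the commit): split the rows of a contiguous 2-D activation tensor [n_state, n_tokens] into attention heads without copying. ggml_reshape_4d gives the same result for a contiguous tensor; ggml_view_4d also allows explicit byte strides.

    #include "ggml.h"

    // x: contiguous 2-D f32 tensor [n_state, n_tok]; returns a view [head_dim, n_head, n_tok, 1]
    static struct ggml_tensor * split_heads(struct ggml_context * ctx,
                                            struct ggml_tensor  * x,
                                            int                   n_head) {
        const int64_t head_dim = x->ne[0]/n_head;
        const size_t  es       = ggml_element_size(x);

        return ggml_view_4d(ctx, x,
                head_dim, n_head, x->ne[1], 1,
                (size_t) head_dim*es,           // nb1: next head within a row
                (size_t) x->ne[0]*es,           // nb2: next token (row)
                (size_t) x->ne[0]*x->ne[1]*es,  // nb3: whole tensor (single slice)
                0);                             // offset in bytes
    }
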
820
  struct ggml_tensor * a,
821
  struct ggml_tensor * b);
822
 
823
+ GGML_API struct ggml_tensor * ggml_get_rows_back(
824
+ struct ggml_context * ctx,
825
+ struct ggml_tensor * a,
826
+ struct ggml_tensor * b,
827
+ struct ggml_tensor * c);
828
+
829
+ GGML_API struct ggml_tensor * ggml_diag(
830
+ struct ggml_context * ctx,
831
+ struct ggml_tensor * a);
832
+
833
  // set elements above the diagonal to -INF
 
834
  GGML_API struct ggml_tensor * ggml_diag_mask_inf(
835
  struct ggml_context * ctx,
836
  struct ggml_tensor * a,
837
  int n_past);
838
 
839
  // in-place, returns view(a)
840
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
841
+ struct ggml_context * ctx,
842
+ struct ggml_tensor * a,
843
+ int n_past);
844
+
845
+ // set elements above the diagonal to 0
846
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero(
847
+ struct ggml_context * ctx,
848
+ struct ggml_tensor * a,
849
+ int n_past);
850
+
851
+ // in-place, returns view(a)
852
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
853
+ struct ggml_context * ctx,
854
+ struct ggml_tensor * a,
855
+ int n_past);
856
+
857
  GGML_API struct ggml_tensor * ggml_soft_max(
858
  struct ggml_context * ctx,
859
  struct ggml_tensor * a);
860
 
 
861
  // in-place, returns view(a)
862
+ GGML_API struct ggml_tensor * ggml_soft_max_inplace(
863
+ struct ggml_context * ctx,
864
+ struct ggml_tensor * a);
865
+
866
+ // rotary position embedding
867
  // if mode & 1 == 1, skip n_past elements
868
  // if mode & 2 == 1, GPT-NeoX style
869
  // TODO: avoid creating a new tensor every time
 
874
  int n_dims,
875
  int mode);
876
 
877
+ // in-place, returns view(a)
878
+ GGML_API struct ggml_tensor * ggml_rope_inplace(
879
+ struct ggml_context * ctx,
880
+ struct ggml_tensor * a,
881
+ int n_past,
882
+ int n_dims,
883
+ int mode);
884
+
885
+ // rotary position embedding backward, i.e. compute dx from dy
886
+ // a - dy
887
+ GGML_API struct ggml_tensor * ggml_rope_back(
888
+ struct ggml_context * ctx,
889
+ struct ggml_tensor * a,
890
+ int n_past,
891
+ int n_dims,
892
+ int mode);
893
+
894
  // alibi position embedding
895
  // in-place, returns view(a)
896
  struct ggml_tensor * ggml_alibi(
 
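The in-place mask/softmax pair is what the whisper.cpp decoder below switches to; a sketch of that masked-attention step is shown here (ggml_diag_mask_zero is the analogous variant that writes 0 instead of -INF).

    #include "ggml.h"

    // KQ: attention scores; positions past the diagonal (future tokens) are masked out
    static struct ggml_tensor * masked_soft_max(struct ggml_context * ctx,
                                                struct ggml_tensor  * KQ,
                                                int                   n_past) {
        // modifies KQ's data in place and returns a view of it
        struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx, KQ, n_past);

        return ggml_soft_max_inplace(ctx, KQ_masked);
    }
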
935
  GGML_API struct ggml_tensor * ggml_map_unary_f32(
936
  struct ggml_context * ctx,
937
  struct ggml_tensor * a,
938
+ ggml_unary_op_f32_t fun);
939
 
940
  GGML_API struct ggml_tensor * ggml_map_binary_f32(
941
  struct ggml_context * ctx,
942
  struct ggml_tensor * a,
943
  struct ggml_tensor * b,
944
+ ggml_binary_op_f32_t fun);
945
 
946
  //
947
  // automatic differentiation
 
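The map hooks above now take the callback parameter without the const qualifier; a minimal sketch of plugging in a custom element-wise op, assuming the f32 callback signature is (n, dst, src) as in ggml.h of this vintage:

    #include <math.h>
    #include "ggml.h"

    // custom element-wise op: dst[i] = expf(src[i])
    static void my_exp_f32(const int n, float * dst, const float * src) {
        for (int i = 0; i < n; ++i) {
            dst[i] = expf(src[i]);
        }
    }

    static struct ggml_tensor * map_exp(struct ggml_context * ctx, struct ggml_tensor * a) {
        return ggml_map_unary_f32(ctx, a, my_exp_f32);
    }
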
1070
 
1071
  GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
1072
  GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
 
1073
  GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
1074
  GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
1075
  GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
whisper.cpp CHANGED
@@ -291,15 +291,6 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
291
  { MODEL_LARGE, 1124ull*MB },
292
  },
293
  },
294
- { GGML_TYPE_Q4_2,
295
- {
296
- { MODEL_TINY, 26ull*MB },
297
- { MODEL_BASE, 50ull*MB },
298
- { MODEL_SMALL, 154ull*MB },
299
- { MODEL_MEDIUM, 470ull*MB },
300
- { MODEL_LARGE, 940ull*MB },
301
- },
302
- },
303
  { GGML_TYPE_Q5_0,
304
  {
305
  { MODEL_TINY, 30ull*MB },
@@ -861,6 +852,10 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
861
  model.type = e_model::MODEL_LARGE;
862
  }
863
 
864
  // for the big tensors, we have the option to store the data in 16-bit floats or quantized
865
  // in order to save memory and also to speed up the computation
866
  wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
@@ -882,6 +877,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
882
  fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
883
  fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
884
  fprintf(stderr, "%s: ftype = %d\n", __func__, model.hparams.ftype);
 
885
  fprintf(stderr, "%s: type = %d\n", __func__, model.type);
886
 
887
  // print memory requirements
@@ -1106,7 +1102,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1106
  ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
1107
  }
1108
 
1109
- ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
1110
 
1111
  fprintf(stderr, "%s: model ctx = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
1112
  }
@@ -1554,14 +1550,14 @@ static bool whisper_encode_internal(
1554
  Qcur),
1555
  Qcur);
1556
 
1557
- //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1558
 
1559
  // note: no bias for Key
1560
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
1561
  layer.attn_k_w,
1562
  cur);
1563
 
1564
- //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1565
 
1566
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
1567
  layer.attn_v_w,
@@ -1621,12 +1617,12 @@ static bool whisper_encode_internal(
1621
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
1622
 
1623
  struct ggml_tensor * KQ_scaled =
1624
- ggml_scale(ctx0,
1625
  KQ,
1626
  ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
1627
  );
1628
 
1629
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
1630
 
1631
  struct ggml_tensor * V =
1632
  ggml_cpy(ctx0,
@@ -1809,7 +1805,7 @@ static bool whisper_encode_internal(
1809
  layer.cross_attn_k_w,
1810
  cur);
1811
 
1812
- Kcross = ggml_scale(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
1813
 
1814
  wstate.use_buf(ctx0, 1);
1815
 
@@ -1956,14 +1952,14 @@ static bool whisper_decode_internal(
1956
  Qcur),
1957
  Qcur);
1958
 
1959
- Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1960
 
1961
  // note: no bias for Key
1962
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
1963
  layer.attn_k_w,
1964
  cur);
1965
 
1966
- Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1967
 
1968
  // store key and value to memory
1969
  {
@@ -2012,14 +2008,14 @@ static bool whisper_decode_internal(
2012
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
2013
 
2014
  //struct ggml_tensor * KQ_scaled =
2015
- // ggml_scale(ctx0,
2016
  // KQ,
2017
  // ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
2018
  // );
2019
 
2020
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);
2021
 
2022
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
2023
 
2024
  struct ggml_tensor * V =
2025
  ggml_view_3d(ctx0, kv_self.v,
@@ -2083,7 +2079,7 @@ static bool whisper_decode_internal(
2083
  Qcur),
2084
  Qcur);
2085
 
2086
- Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
2087
 
2088
  // Kcross is already scaled
2089
  struct ggml_tensor * Kcross =
@@ -2123,15 +2119,15 @@ static bool whisper_decode_internal(
2123
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
2124
 
2125
  //struct ggml_tensor * KQ_scaled =
2126
- // ggml_scale(ctx0,
2127
  // KQ,
2128
  // ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
2129
  // );
2130
 
2131
  // no masking for cross-attention
2132
- //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
2133
 
2134
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
2135
 
2136
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
2137
 
@@ -4903,7 +4899,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
4903
  // b: N*N*sizeof(float)
4904
  // c: N*N*sizeof(float)
4905
  // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
4906
- std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
4907
 
4908
  // put a bunch of random data in the buffer
4909
  for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
@@ -4911,7 +4907,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
4911
  for (int j = 0; j < (int) sizes.size(); j++) {
4912
  int n_q4_0 = 0;
4913
  int n_q4_1 = 0;
4914
- int n_q4_2 = 0;
4915
  int n_q5_0 = 0;
4916
  int n_q5_1 = 0;
4917
  int n_q8_0 = 0;
@@ -4921,7 +4916,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
4921
  // GFLOPS/s
4922
  double s_q4_0 = 0.0;
4923
  double s_q4_1 = 0.0;
4924
- double s_q4_2 = 0.0;
4925
  double s_q5_0 = 0.0;
4926
  double s_q5_1 = 0.0;
4927
  double s_q8_0 = 0.0;
@@ -4930,18 +4924,17 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
4930
 
4931
  const size_t N = sizes[j];
4932
 
4933
- for (int k = 0; k < 8; ++k) {
4934
  const ggml_type wtype =
4935
  k == 0 ? GGML_TYPE_Q4_0 :
4936
  k == 1 ? GGML_TYPE_Q4_1 :
4937
- k == 2 ? GGML_TYPE_Q4_2 :
4938
- k == 3 ? GGML_TYPE_Q5_0 :
4939
- k == 4 ? GGML_TYPE_Q5_1 :
4940
- k == 5 ? GGML_TYPE_Q8_0 :
4941
- k == 6 ? GGML_TYPE_F16 : GGML_TYPE_F32;
4942
 
4943
- double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q4_2 : k == 3 ? s_q5_0 : k == 4 ? s_q5_1 : k == 5 ? s_q8_0 : k == 6 ? s_fp16 : /*k == 7*/ s_fp32;
4944
- int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q4_2 : k == 3 ? n_q5_0 : k == 4 ? n_q5_1 : k == 5 ? n_q8_0 : k == 6 ? n_fp16 : /*k == 7*/ n_fp32;
4945
 
4946
  struct ggml_init_params gparams = {
4947
  /*.mem_size =*/ buf.size(),
@@ -4985,9 +4978,9 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
4985
  s = ((2.0*N*N*N*n)/tsum)*1e-9;
4986
  }
4987
 
4988
- // Q4_0 | Q4_1 | Q4_2
4989
- snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs) | Q4_2 %7.1f GFLOPS (%3d runs)\n",
4990
- N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_q4_2, n_q4_2);
4991
  s += strbuf;
4992
 
4993
  // Q5_0 | Q5_1 | Q8_0
 
291
  { MODEL_LARGE, 1124ull*MB },
292
  },
293
  },
 
294
  { GGML_TYPE_Q5_0,
295
  {
296
  { MODEL_TINY, 30ull*MB },
 
852
  model.type = e_model::MODEL_LARGE;
853
  }
854
 
855
+ const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
856
+
857
+ hparams.ftype %= GGML_QNT_VERSION_FACTOR;
858
+
859
  // for the big tensors, we have the option to store the data in 16-bit floats or quantized
860
  // in order to save memory and also to speed up the computation
861
  wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
 
877
  fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
878
  fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
879
  fprintf(stderr, "%s: ftype = %d\n", __func__, model.hparams.ftype);
880
+ fprintf(stderr, "%s: qntvr = %d\n", __func__, qntvr);
881
  fprintf(stderr, "%s: type = %d\n", __func__, model.type);
882
 
883
  // print memory requirements
 
1102
  ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
1103
  }
1104
 
1105
+ ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead
1106
 
1107
  fprintf(stderr, "%s: model ctx = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
1108
  }
 
1550
  Qcur),
1551
  Qcur);
1552
 
1553
+ //Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1554
 
1555
  // note: no bias for Key
1556
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
1557
  layer.attn_k_w,
1558
  cur);
1559
 
1560
+ //Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1561
 
1562
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
1563
  layer.attn_v_w,
 
1617
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
1618
 
1619
  struct ggml_tensor * KQ_scaled =
1620
+ ggml_scale_inplace(ctx0,
1621
  KQ,
1622
  ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
1623
  );
1624
 
1625
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);
1626
 
1627
  struct ggml_tensor * V =
1628
  ggml_cpy(ctx0,
 
1805
  layer.cross_attn_k_w,
1806
  cur);
1807
 
1808
+ Kcross = ggml_scale_inplace(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
1809
 
1810
  wstate.use_buf(ctx0, 1);
1811
 
 
1952
  Qcur),
1953
  Qcur);
1954
 
1955
+ Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1956
 
1957
  // note: no bias for Key
1958
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
1959
  layer.attn_k_w,
1960
  cur);
1961
 
1962
+ Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1963
 
1964
  // store key and value to memory
1965
  {
 
2008
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
2009
 
2010
  //struct ggml_tensor * KQ_scaled =
2011
+ // ggml_scale_inplace(ctx0,
2012
  // KQ,
2013
  // ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
2014
  // );
2015
 
2016
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);
2017
 
2018
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
2019
 
2020
  struct ggml_tensor * V =
2021
  ggml_view_3d(ctx0, kv_self.v,
 
2079
  Qcur),
2080
  Qcur);
2081
 
2082
+ Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
2083
 
2084
  // Kcross is already scaled
2085
  struct ggml_tensor * Kcross =
 
2119
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
2120
 
2121
  //struct ggml_tensor * KQ_scaled =
2122
+ // ggml_scale_inplace(ctx0,
2123
  // KQ,
2124
  // ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
2125
  // );
2126
 
2127
  // no masking for cross-attention
2128
+ //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2129
 
2130
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ);
2131
 
2132
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
2133
 
 
4899
  // b: N*N*sizeof(float)
4900
  // c: N*N*sizeof(float)
4901
  // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
4902
+ std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);
4903
 
4904
  // put a bunch of random data in the buffer
4905
  for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
 
4907
  for (int j = 0; j < (int) sizes.size(); j++) {
4908
  int n_q4_0 = 0;
4909
  int n_q4_1 = 0;
 
4910
  int n_q5_0 = 0;
4911
  int n_q5_1 = 0;
4912
  int n_q8_0 = 0;
 
4916
  // GFLOPS/s
4917
  double s_q4_0 = 0.0;
4918
  double s_q4_1 = 0.0;
 
4919
  double s_q5_0 = 0.0;
4920
  double s_q5_1 = 0.0;
4921
  double s_q8_0 = 0.0;
 
4924
 
4925
  const size_t N = sizes[j];
4926
 
4927
+ for (int k = 0; k < 7; ++k) {
4928
  const ggml_type wtype =
4929
  k == 0 ? GGML_TYPE_Q4_0 :
4930
  k == 1 ? GGML_TYPE_Q4_1 :
4931
+ k == 2 ? GGML_TYPE_Q5_0 :
4932
+ k == 3 ? GGML_TYPE_Q5_1 :
4933
+ k == 4 ? GGML_TYPE_Q8_0 :
4934
+ k == 5 ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
4935
 
4936
+ double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : k == 4 ? s_q8_0 : k == 5 ? s_fp16 : /*k == 6*/ s_fp32;
4937
+ int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : k == 4 ? n_q8_0 : k == 5 ? n_fp16 : /*k == 6*/ n_fp32;
4938
 
4939
  struct ggml_init_params gparams = {
4940
  /*.mem_size =*/ buf.size(),
 
4978
  s = ((2.0*N*N*N*n)/tsum)*1e-9;
4979
  }
4980
 
4981
+ // Q4_0 | Q4_1
4982
+ snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)\n",
4983
+ N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1);
4984
  s += strbuf;
4985
 
4986
  // Q5_0 | Q5_1 | Q8_0