ggerganov committed
Commit 803e1be · unverified · 1 Parent(s): 4ceb1ab

ggml : sync latest ggml


- New Q4 and Q5 formats
- Various improvements

Files changed (10)
  1. examples/common-ggml.cpp +0 -6
  2. examples/common.cpp +34 -0
  3. examples/common.h +3 -0
  4. examples/quantize/quantize.cpp +10 -4
  5. ggml-cuda.cu +291 -109
  6. ggml-cuda.h +2 -0
  7. ggml-opencl.c +85 -122
  8. ggml.c +0 -0
  9. ggml.h +205 -12
  10. whisper.cpp +31 -38
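
The commit message above highlights the new Q4 and Q5 block formats. As a rough orientation before the per-file diffs, here is a hedged, CPU-side C++ sketch of how a Q5_0 block is laid out and dequantized; it mirrors the `block_q5_0` struct and the `dequantize_q5_0` device function that appear in the ggml-cuda.cu diff below, with a plain `float` standing in for the fp16 delta and the `_sketch` names being illustrative rather than part of ggml.

    #include <cstdint>
    #include <cstring>

    // Illustrative Q5_0 block: 32 weights stored as 16 nibble pairs plus a 32-bit
    // mask holding each weight's 5th bit, scaled by a per-block delta.
    constexpr int QK5_0 = 32;

    struct block_q5_0_sketch {
        float   d;             // delta (fp16 in the real format)
        uint8_t qh[4];         // 5th bit of the quants
        uint8_t qs[QK5_0 / 2]; // low nibbles of the quants
    };

    // Dequantize one block into 32 floats, following the same bit layout as
    // dequantize_q5_0 in the CUDA diff: low nibble plus high bit, offset by 16.
    void dequantize_block_q5_0_sketch(const block_q5_0_sketch & b, float * y) {
        uint32_t qh;
        std::memcpy(&qh, b.qh, sizeof(qh));

        for (int j = 0; j < QK5_0 / 2; ++j) {
            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10; // bit j      -> bit 4
            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10; // bit j + 16 -> bit 4

            const int32_t x0 = ((b.qs[j] & 0x0f) | xh_0) - 16;
            const int32_t x1 = ((b.qs[j] >>   4) | xh_1) - 16;

            y[j]           = x0 * b.d;
            y[j + QK5_0/2] = x1 * b.d;
        }
    }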
examples/common-ggml.cpp CHANGED
@@ -6,7 +6,6 @@
 static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
     {"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
     {"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
-    {"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
     {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
     {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
     {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
@@ -46,7 +45,6 @@ bool ggml_common_quantize_0(
     switch (ftype) {
         case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
         case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
-        case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
         case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
         case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
         case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
@@ -171,10 +169,6 @@ bool ggml_common_quantize_0(
             {
                 cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
             } break;
-        case GGML_TYPE_Q4_2:
-            {
-                cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-            } break;
         case GGML_TYPE_Q5_0:
             {
                 cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
examples/common.cpp CHANGED
@@ -38,6 +38,20 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "-h" || arg == "--help") {
             gpt_print_usage(argc, argv, params);
             exit(0);
+        } else if (arg == "-f" || arg == "--file") {
+            if (++i > argc) {
+                fprintf(stderr, "Invalid file param");
+                break;
+            }
+            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                break;
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
+            if (params.prompt.back() == '\n') {
+                params.prompt.pop_back();
+            }
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             gpt_print_usage(argc, argv, params);
@@ -57,6 +71,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -t N, --threads N  number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
     fprintf(stderr, "                     prompt to start generation with (default: random)\n");
+    fprintf(stderr, "  -f FNAME, --file FNAME\n");
+    fprintf(stderr, "                     load prompt from a file\n");
     fprintf(stderr, "  -n N, --n_predict N  number of tokens to predict (default: %d)\n", params.n_predict);
     fprintf(stderr, "  --top_k N          top-k sampling (default: %d)\n", params.top_k);
     fprintf(stderr, "  --top_p N          top-p sampling (default: %.1f)\n", params.top_p);
@@ -192,6 +208,10 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
     return result;
 }
 
+void gpt_vocab::add_special_token(const std::string & token) {
+    special_tokens.push_back(token);
+}
+
 std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
     std::vector<std::string> words;
 
@@ -200,6 +220,20 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
     std::string str = text;
     std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
 
+    // Generate the subpattern from the special_tokens vector if it's not empty
+    if (!vocab.special_tokens.empty()) {
+        std::string special_tokens_subpattern;
+        for (const auto & token : vocab.special_tokens) {
+            if (!special_tokens_subpattern.empty()) {
+                special_tokens_subpattern += "|";
+            }
+            special_tokens_subpattern += token;
+        }
+
+        // Modify the regex pattern with the generated special tokens subpattern
+        pat = special_tokens_subpattern + "|" + pat;
+    }
+
     std::regex re(pat);
     std::smatch m;
 
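A note on the tokenizer change above: gpt_tokenize now builds an alternation of the registered special tokens and prepends it to the base GPT-2 pattern, so a special token is matched whole before the generic word/number rules apply. The self-contained C++ sketch below replays that logic outside the library; the `<|endoftext|>` token (pre-escaped for regex use) and the loop around std::regex_search are illustrative assumptions, not code from this commit.

    #include <iostream>
    #include <regex>
    #include <string>
    #include <vector>

    int main() {
        // Hypothetical special token, already escaped for use inside a regex.
        std::vector<std::string> special_tokens = { "<\\|endoftext\\|>" };

        // Base GPT-2 style pattern, as in examples/common.cpp.
        std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";

        // Same construction as the diff: OR the special tokens in front of the pattern.
        std::string special_tokens_subpattern;
        for (const auto & token : special_tokens) {
            if (!special_tokens_subpattern.empty()) {
                special_tokens_subpattern += "|";
            }
            special_tokens_subpattern += token;
        }
        pat = special_tokens_subpattern + "|" + pat;

        std::regex  re(pat);
        std::smatch m;
        std::string s = "hello<|endoftext|>";
        while (std::regex_search(s, m, re)) {
            std::cout << "token: '" << m.str() << "'\n"; // "hello", then "<|endoftext|>"
            s = m.suffix().str();
        }
        return 0;
    }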
examples/common.h CHANGED
@@ -53,6 +53,9 @@ struct gpt_vocab {
 
     std::map<token, id> token_to_id;
     std::map<id, token> id_to_token;
+    std::vector<std::string> special_tokens;
+
+    void add_special_token(const std::string & token);
 };
 
 // poor-man's JSON parsing
examples/quantize/quantize.cpp CHANGED
@@ -25,7 +25,7 @@ struct whisper_hparams {
     int32_t n_text_head = 6;
     int32_t n_text_layer = 4;
     int32_t n_mels = 80;
-    int32_t f16 = 1;
+    int32_t ftype = 1;
 };
 
 struct whisper_filters {
@@ -79,7 +79,10 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
     finp.read((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
     finp.read((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
     finp.read((char *) &hparams.n_mels, sizeof(hparams.n_mels));
-    finp.read((char *) &hparams.f16, sizeof(hparams.f16));
+    finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+
+    const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+    const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
 
     fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
     fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
@@ -91,7 +94,10 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
     fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
     fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
     fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
-    fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
+    fprintf(stderr, "%s: ftype (src) = %d\n", __func__, hparams.ftype);
+    fprintf(stderr, "%s: qntvr (src) = %d\n", __func__, qntvr_src);
+    fprintf(stderr, "%s: ftype (dst) = %d\n", __func__, ftype_dst);
+    fprintf(stderr, "%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
 
     fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
     fout.write((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
@@ -103,7 +109,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
     fout.write((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
     fout.write((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
     fout.write((char *) &hparams.n_mels, sizeof(hparams.n_mels));
-    fout.write((char *) &ftype, sizeof(hparams.f16));
+    fout.write((char *) &ftype_dst, sizeof(hparams.ftype));
 }
 
 // load mel filters
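
The new `ftype` header field packs a quantization-format version together with the model file type via the GGML_QNT_VERSION / GGML_QNT_VERSION_FACTOR constants added to ggml.h later in this commit. A minimal worked example of the arithmetic used in the hunk above; the concrete values 8 and 2 are illustrative:

    #include <cstdint>
    #include <cstdio>

    // Same constants as the ggml.h hunk later in this commit.
    #define GGML_QNT_VERSION        1
    #define GGML_QNT_VERSION_FACTOR 1000

    int main() {
        const int32_t ftype     = 8; // requested destination type, e.g. GGML_FTYPE_MOSTLY_Q5_0
        const int32_t ftype_src = 2; // value read from an older, pre-versioned source header

        const int32_t qntvr_src = ftype_src / GGML_QNT_VERSION_FACTOR;                // 2 / 1000   = 0
        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; // 1*1000 + 8 = 1008

        printf("qntvr (src) = %d\n", qntvr_src);                              // 0 -> pre-versioned file
        printf("ftype (dst) = %d\n", ftype_dst);                              // 1008
        printf("qntvr (dst) = %d\n", ftype_dst / GGML_QNT_VERSION_FACTOR);    // 1
        printf("type  (dst) = %d\n", ftype_dst % GGML_QNT_VERSION_FACTOR);    // 8
        return 0;
    }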
ggml-cuda.cu CHANGED
@@ -32,9 +32,15 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     } \
 } while (0)
 
+typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
 typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
+typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream);
+
+// QK = number of values after dequantization
+// QR = QK / number of values before dequantization
 
 #define QK4_0 32
+#define QR4_0 2
 typedef struct {
     float d;               // delta
     uint8_t qs[QK4_0 / 2]; // nibbles / quants
@@ -42,6 +48,7 @@ typedef struct {
 static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
 
 #define QK4_1 32
+#define QR4_1 2
 typedef struct {
     float d; // delta
     float m; // min
@@ -49,14 +56,8 @@ typedef struct {
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
 
-#define QK4_2 16
-typedef struct {
-    half d;                // delta
-    uint8_t qs[QK4_2 / 2]; // nibbles / quants
-} block_q4_2;
-static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
-
 #define QK5_0 32
+#define QR5_0 2
 typedef struct {
     half d;        // delta
     uint8_t qh[4]; // 5-th bit of quants
@@ -65,6 +66,7 @@ typedef struct {
 static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
 
 #define QK5_1 32
+#define QR5_1 2
 typedef struct {
     half d; // delta
     half m; // min
@@ -74,112 +76,164 @@ typedef struct {
 static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
 
 #define QK8_0 32
+#define QR8_0 1
 typedef struct {
     float d;          // delta
     int8_t qs[QK8_0]; // quants
 } block_q8_0;
 static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
 
-static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
+#define CUDA_DMMV_BLOCK_SIZE 32
+
+static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
     const block_q4_0 * x = (const block_q4_0 *) vx;
 
-    const int i = blockIdx.x;
+    const float d = x[ib].d;
 
-    const float d = x[i].d;
+    const uint8_t vui = x[ib].qs[iqs];
+
+    const int8_t vi0 = vui & 0xF;
+    const int8_t vi1 = vui >> 4;
 
-    const uint8_t * pp = x[i].qs;
+    v0 = (vi0 - 8)*d;
+    v1 = (vi1 - 8)*d;
+}
+
+static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q4_1 * x = (const block_q4_1 *) vx;
 
-    for (int l = 0; l < QK4_0; l += 2) {
-        const uint8_t vi = pp[l/2];
+    const float d = x[ib].d;
+    const float m = x[ib].m;
 
-        const int8_t vi0 = vi & 0xf;
-        const int8_t vi1 = vi >> 4;
+    const uint8_t vui = x[ib].qs[iqs];
 
-        const float v0 = (vi0 - 8)*d;
-        const float v1 = (vi1 - 8)*d;
+    const int8_t vi0 = vui & 0xF;
+    const int8_t vi1 = vui >> 4;
 
-        y[i*QK4_0 + l + 0] = v0;
-        y[i*QK4_0 + l + 1] = v1;
-    }
+    v0 = vi0*d + m;
+    v1 = vi1*d + m;
 }
 
-static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
-    const block_q4_1 * x = (const block_q4_1 *) vx;
+static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q5_0 * x = (const block_q5_0 *) vx;
 
-    const int i = blockIdx.x;
+    const float d = x[ib].d;
 
-    const float d = x[i].d;
-    const float m = x[i].m;
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
 
-    const uint8_t * pp = x[i].qs;
+    const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16;
+    const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16;
 
-    for (int l = 0; l < QK4_1; l += 2) {
-        const uint8_t vi = pp[l/2];
+    v0 = x0*d;
+    v1 = x1*d;
+}
 
-        const int8_t vi0 = vi & 0xf;
-        const int8_t vi1 = vi >> 4;
+static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q5_1 * x = (const block_q5_1 *) vx;
 
-        const float v0 = vi0*d + m;
-        const float v1 = vi1*d + m;
+    const float d = x[ib].d;
+    const float m = x[ib].m;
 
-        y[i*QK4_1 + l + 0] = v0;
-        y[i*QK4_1 + l + 1] = v1;
-    }
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
+
+    const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1);
+
+    v0 = x0*d + m;
+    v1 = x1*d + m;
 }
 
-static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
-    const block_q4_2 * x = (const block_q4_2 *) vx;
+static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q8_0 * x = (const block_q8_0 *) vx;
+
+    const float d = x[ib].d;
+
+    const int8_t vi0 = x[ib].qs[iqs + 0];
+    const int8_t vi1 = x[ib].qs[iqs + 1];
+
+    v0 = vi0*d;
+    v1 = vi1*d;
+}
+
+static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const half * x = (const half *) vx;
+
+    v0 = __half2float(x[ib + 0]);
+    v1 = __half2float(x[ib + 1]);
+}
+
+static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
+    static const int qk = QK4_0;
+
+    const block_q4_0 * x = (const block_q4_0 *) vx;
 
     const int i = blockIdx.x;
 
     const float d = x[i].d;
 
-    const uint8_t * pp = x[i].qs;
+    for (int j = 0; j < qk/2; ++j) {
+        const int x0 = (x[i].qs[j] & 0xf) - 8;
+        const int x1 = (x[i].qs[j] >> 4) - 8;
 
-    for (int l = 0; l < QK4_2; l += 2) {
-        const uint8_t vi = pp[l/2];
+        y[i*qk + j + 0   ] = x0*d;
+        y[i*qk + j + qk/2] = x1*d;
+    }
+}
+
+static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
+    static const int qk = QK4_1;
 
-        const int8_t vi0 = vi & 0xf;
-        const int8_t vi1 = vi >> 4;
+    const block_q4_1 * x = (const block_q4_1 *) vx;
 
-        const float v0 = (vi0 - 8)*d;
-        const float v1 = (vi1 - 8)*d;
+    const int i = blockIdx.x;
 
-        y[i*QK4_2 + l + 0] = v0;
-        y[i*QK4_2 + l + 1] = v1;
+    const float d = x[i].d;
+    const float m = x[i].m;
+
+    for (int j = 0; j < qk/2; ++j) {
+        const int x0 = (x[i].qs[j] & 0xf);
+        const int x1 = (x[i].qs[j] >> 4);
+
+        y[i*qk + j + 0   ] = x0*d + m;
+        y[i*qk + j + qk/2] = x1*d + m;
     }
 }
 
 static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
+    static const int qk = QK5_0;
+
     const block_q5_0 * x = (const block_q5_0 *) vx;
 
     const int i = blockIdx.x;
 
     const float d = x[i].d;
 
-    const uint8_t * pp = x[i].qs;
-
     uint32_t qh;
     memcpy(&qh, x[i].qh, sizeof(qh));
 
-    for (int l = 0; l < QK5_0; l += 2) {
-        const uint8_t vi = pp[l/2];
+    for (int j = 0; j < qk/2; ++j) {
+        const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+        const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
 
-        const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
-        const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
+        const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
+        const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
 
-        const int8_t vi0 = ((vi & 0xf) | vh0);
-        const int8_t vi1 = ((vi >> 4) | vh1);
-
-        const float v0 = (vi0 - 16)*d;
-        const float v1 = (vi1 - 16)*d;
-
-        y[i*QK5_0 + l + 0] = v0;
-        y[i*QK5_0 + l + 1] = v1;
+        y[i*qk + j + 0   ] = x0*d;
+        y[i*qk + j + qk/2] = x1*d;
     }
 }
 
 static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
+    static const int qk = QK5_1;
+
     const block_q5_1 * x = (const block_q5_1 *) vx;
 
     const int i = blockIdx.x;
@@ -187,41 +241,70 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
     const float d = x[i].d;
     const float m = x[i].m;
 
-    const uint8_t * pp = x[i].qs;
-
     uint32_t qh;
     memcpy(&qh, x[i].qh, sizeof(qh));
 
-    for (int l = 0; l < QK5_1; l += 2) {
-        const uint8_t vi = pp[l/2];
+    for (int j = 0; j < qk/2; ++j) {
+        const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+        const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
 
-        const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
-        const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
+        const int x0 = (x[i].qs[j] & 0xf) | xh_0;
+        const int x1 = (x[i].qs[j] >> 4) | xh_1;
 
-        const int8_t vi0 = (vi & 0xf) | vh0;
-        const int8_t vi1 = (vi >> 4) | vh1;
-
-        const float v0 = vi0*d + m;
-        const float v1 = vi1*d + m;
-
-        y[i*QK5_1 + l + 0] = v0;
-        y[i*QK5_1 + l + 1] = v1;
+        y[i*qk + j + 0   ] = x0*d + m;
+        y[i*qk + j + qk/2] = x1*d + m;
     }
 }
 
 static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
+    static const int qk = QK8_0;
+
     const block_q8_0 * x = (const block_q8_0 *) vx;
 
     const int i = blockIdx.x;
 
     const float d = x[i].d;
 
-    const int8_t * pp = x[i].qs;
+    for (int j = 0; j < qk; ++j) {
+        y[i*qk + j] = x[i].qs[j]*d;
+    }
+}
+
+template <int block_size, int qk, int qr, dequantize_kernel_t dequantize_kernel>
+static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols) {
+    const int row = blockIdx.x;
+    const int tid = threadIdx.x;
+
+    const int y_offset = qr == 1 ? 1 : qk/2;
 
-    for (int l = 0; l < QK8_0; l++) {
-        const int8_t vi = pp[l];
+    __shared__ float tmp[block_size]; // separate sum for each thread
+    tmp[tid] = 0;
 
-        y[i*QK8_0 + l] = vi*d;
+    for (int i = 0; i < ncols/block_size; i += 2) {
+        const int col = i*block_size + 2*tid;
+        const int ib = (row*ncols + col)/qk; // block index
+        const int iqs = (col%qk)/qr; // quant index
+        const int iybs = col - col%qk; // y block start index
+
+        // dequantize
+        float v0, v1;
+        dequantize_kernel(vx, ib, iqs, v0, v1);
+
+        // matrix multiplication
+        tmp[tid] += v0 * y[iybs + iqs + 0];
+        tmp[tid] += v1 * y[iybs + iqs + y_offset];
+    }
+
+    // sum up partial sums and write back result
+    __syncthreads();
+    for (int s=block_size/2; s>0; s>>=1) {
+        if (tid < s) {
+            tmp[tid] += tmp[tid + s];
+        }
+        __syncthreads();
+    }
+    if (tid == 0) {
+        dst[row] = tmp[0];
     }
 }
 
@@ -235,11 +318,6 @@ static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStre
     dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
 }
 
-static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
-    const int nb = k / QK4_2;
-    dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
-}
-
 static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK5_0;
     dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
@@ -255,6 +333,36 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStre
     dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
 }
 
+static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_0, QR4_0, dequantize_q4_0>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_1, QR4_1, dequantize_q4_1>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_0, QR5_0, dequantize_q5_0>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_1, QR5_1, dequantize_q5_1>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK8_0, QR8_0, dequantize_q8_0>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
 // TODO: optimize
 static __global__ void convert_fp16_to_fp32(const void * vx, float * y) {
     const half * x = (const half *) vx;
@@ -268,14 +376,18 @@ static void convert_fp16_to_fp32_cuda(const void * x, float * y, int k, cudaStre
     convert_fp16_to_fp32<<<k, 1, 0, stream>>>(x, y);
 }
 
+static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, 32, 1, convert_f16>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
 static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
            return dequantize_row_q4_0_cuda;
        case GGML_TYPE_Q4_1:
            return dequantize_row_q4_1_cuda;
-        case GGML_TYPE_Q4_2:
-            return dequantize_row_q4_2_cuda;
        case GGML_TYPE_Q5_0:
            return dequantize_row_q5_0_cuda;
        case GGML_TYPE_Q5_1:
@@ -289,8 +401,27 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     }
 }
 
+static dequantize_mul_mat_vec_cuda_t ggml_get_dequantize_mul_mat_vec_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_mul_mat_vec_q4_0_cuda;
+        case GGML_TYPE_Q4_1:
+            return dequantize_mul_mat_vec_q4_1_cuda;
+        case GGML_TYPE_Q5_0:
+            return dequantize_mul_mat_vec_q5_0_cuda;
+        case GGML_TYPE_Q5_1:
+            return dequantize_mul_mat_vec_q5_1_cuda;
+        case GGML_TYPE_Q8_0:
+            return dequantize_mul_mat_vec_q8_0_cuda;
+        case GGML_TYPE_F16:
+            return convert_mul_mat_vec_f16_cuda;
+        default:
+            return nullptr;
+    }
+}
+
 // buffer pool for cuda
-#define MAX_CUDA_BUFFERS 16
+#define MAX_CUDA_BUFFERS 256
 
 struct scoped_spin_lock {
     std::atomic_flag& lock;
@@ -348,7 +479,7 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
     CUDA_CHECK(cudaFree(ptr));
 }
 
-#define GGML_CUDA_MAX_STREAMS 8
+#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
 #define GGML_CUDA_MAX_EVENTS 64
 static cublasHandle_t g_cublasH = nullptr;
 static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_STREAMS] = { nullptr };
@@ -587,6 +718,7 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     const ggml_type type = src0->type;
+    const bool mul_mat_vec = ne11 == 1;
 
     const float alpha = 1.0f;
     const float beta = 0.0f;
@@ -597,12 +729,16 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
     const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
 
     size_t x_size, y_size, d_size, q_size;
-    float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
+    float * d_X = nullptr;
+    if (!mul_mat_vec) {
+        d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
+    }
     float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
     float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
     char  * d_Q = (char  *) ggml_cuda_pool_malloc(n_mm * q_sz, &q_size);
 
     const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(type);
+    dequantize_mul_mat_vec_cuda_t dmmv = ggml_get_dequantize_mul_mat_vec_cuda(type);
     GGML_ASSERT(to_fp32_cuda != nullptr);
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -612,31 +748,54 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
             cudaStream_t cudaStream2 = g_cudaStreams2[i % GGML_CUDA_MAX_STREAMS];
             cudaEvent_t  cudaEvent   = g_cudaEvents[i % GGML_CUDA_MAX_EVENTS];
 
-            float * c_X = d_X + i * x_ne;
             float * c_Y = d_Y + i * y_ne;
             float * c_D = d_D + i * d_ne;
             char  * c_Q = d_Q + i * q_sz;
 
-            // copy src0 and convert to fp32 on device
-            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2));
-            to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2);
-            CUDA_CHECK(cudaGetLastError());
-            CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
+            // copy src0 to device if necessary
+            if (src0->backend == GGML_BACKEND_CPU) {
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2));
+            } else if (src0->backend == GGML_BACKEND_CUDA) {
+                c_Q = ((char *) src0->data) + i * q_sz;
+            } else {
+                GGML_ASSERT(false);
+            }
+            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
+                CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
 
-            // copy src1 to device
-            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
+                // copy src1 to device
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
 
-            // wait for conversion
-            CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
+                // wait for data
+                CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
 
-            // compute
-            CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
-            CUBLAS_CHECK(
-                cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
-                        ne01, ne11, ne10,
-                        &alpha, c_X, ne00,
-                                c_Y, ne10,
-                        &beta,  c_D, ne01));
+                // compute
+                dmmv(c_Q, c_Y, c_D, ne00, ne01, cudaStream);
+                CUDA_CHECK(cudaGetLastError());
+
+            } else { // general dequantization kernel + cuBLAS matrix matrix multiplication
+                float * c_X = d_X + i * x_ne;
+
+                // convert src0 to fp32 on device
+                to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2);
+                CUDA_CHECK(cudaGetLastError());
+                CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
+
+                // copy src1 to device
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
+
+                // wait for conversion
+                CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
+
+                // compute
+                CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
+                CUBLAS_CHECK(
                    cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
+                            ne01, ne11, ne10,
+                            &alpha, c_X, ne00,
+                                    c_Y, ne10,
+                            &beta,  c_D, ne01));
+            }
 
             // copy dst to host
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
@@ -645,7 +804,9 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
     }
 
     CUDA_CHECK(cudaDeviceSynchronize());
-    ggml_cuda_pool_free(d_X, x_size);
+    if (!mul_mat_vec) {
+        ggml_cuda_pool_free(d_X, x_size);
+    }
     ggml_cuda_pool_free(d_Y, y_size);
     ggml_cuda_pool_free(d_D, d_size);
     ggml_cuda_pool_free(d_Q, q_size);
@@ -661,8 +822,7 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
         dst->type == GGML_TYPE_F32 &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CUDA)) {
         return true;
     }
 
@@ -714,3 +874,25 @@ size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct
         return 0;
     }
 }
+
+void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
+    const int64_t ne0 = tensor->ne[0];
+    const int64_t ne1 = tensor->ne[1];
+    const int64_t ne2 = tensor->ne[2];
+    const int64_t ne3 = tensor->ne[3];
+
+    const ggml_type type = tensor->type;
+    const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
+
+    size_t q_size;
+    char * d_Q = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);
+
+    cudaStream_t cudaStream2 = g_cudaStreams2[0];
+
+    // copy tensor to device
+    CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, tensor, 0, 0, cudaStream2));
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    tensor->data = d_Q;
+    tensor->backend = GGML_BACKEND_CUDA;
+}
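
For the dequantize_mul_mat_vec kernel introduced above: each CUDA block handles one matrix row, each of its 32 threads dequantizes two values per iteration, and the per-thread partial dot products are combined with a shared-memory tree reduction that halves the stride `s` each step. The short host-side C++ sketch below only replays the index arithmetic (ib, iqs, iybs) to make the mapping visible; it is not the CUDA implementation, and the printed thread and column choices are illustrative.

    #include <cstdio>

    // Host-side emulation of the index math in dequantize_mul_mat_vec for one row.
    int main() {
        const int block_size = 32;   // CUDA_DMMV_BLOCK_SIZE
        const int qk = 32, qr = 2;   // e.g. Q4_0: 32 values per block, 2 values per quant byte
        const int ncols = 128;       // must be a multiple of block_size
        const int row = 0;

        const int y_offset = qr == 1 ? 1 : qk/2;

        for (int tid = 0; tid < 4; ++tid) {              // print a few threads only
            for (int i = 0; i < ncols/block_size; i += 2) {
                const int col  = i*block_size + 2*tid;
                const int ib   = (row*ncols + col)/qk;   // quantized block index
                const int iqs  = (col % qk)/qr;          // index inside the block
                const int iybs = col - col % qk;         // start of the matching y block
                printf("tid %d: col %3d -> ib %d, iqs %2d, reads y[%3d] and y[%3d]\n",
                       tid, col, ib, iqs, iybs + iqs, iybs + iqs + y_offset);
            }
        }
        return 0;
    }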
ggml-cuda.h CHANGED
@@ -14,6 +14,8 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
 void * ggml_cuda_host_malloc(size_t size);
 void   ggml_cuda_host_free(void * ptr);
 
+void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
+
 #ifdef __cplusplus
 }
 #endif
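
A hedged usage sketch for the new ggml_cuda_transform_tensor declaration above, based on how the ggml-cuda.cu changes consume it: uploading a quantized weight tensor once marks it GGML_BACKEND_CUDA, so ggml_cuda_mul_mat_q_f32 can skip the per-multiplication host-to-device copy. The wrapper function name is an assumption for illustration, not part of this commit.

    #include "ggml.h"
    #include "ggml-cuda.h"

    // Illustrative helper: move an already-loaded weight tensor to the GPU once.
    void offload_weight(struct ggml_tensor * w) {
        ggml_cuda_transform_tensor(w); // copies w->data into device memory
        // w->data now points at device memory and w->backend == GGML_BACKEND_CUDA,
        // so the CUDA mul_mat path reuses it instead of re-uploading each call.
    }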
ggml-opencl.c CHANGED
@@ -12,129 +12,129 @@
 #define MULTILINE_QUOTE(...) #__VA_ARGS__
 const char * clblast_dequant = MULTILINE_QUOTE(
 
+typedef uchar uint8_t;
+typedef int int32_t;
+typedef uint uint32_t;
+
+constant uint QK4_0 = 32;
 struct block_q4_0
 {
     float d;
-    uchar qs[16];
+    uint8_t qs[QK4_0 / 2];
 };
 
-__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = blocks[i].d;
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint index = i*32 + l*2;
-    result[index + 0] = ((vi & 0xf) - 8)*d;
-    result[index + 1] = ((vi >> 4) - 8)*d;
-}
-
-struct block_q4_1
-{
-    float d;
-    float m;
-    uchar qs[16];
-};
-
-__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = blocks[i].d;
-    const float m = blocks[i].m;
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint index = i*32 + l*2;
-    result[index + 0] = (vi & 0xf) * d + m;
-    result[index + 1] = (vi >> 4) * d + m;
-}
-
-struct block_q4_2
-{
-    ushort d;
-    uchar qs[8];
-};
-
-__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 16;
-    const uint l = get_local_id(0);
-
-    const float d = vload_half(0, (__global half*) &blocks[i].d);
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint index = i*16 + l*2;
-    result[index + 0] = ((vi & 0xf) - 8)*d;
-    result[index + 1] = ((vi >> 4) - 8)*d;
-}
-
-struct block_q5_0
-{
-    float d;
-    uint qh;
-    uchar qs[16];
-};
-
-__kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = blocks[i].d;
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint l2 = l * 2;
-
-    const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
-    const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
-
-    const uint index = i*32 + l2;
-    result[index + 0] = (((vi & 0xf) | vh0) - 16)*d;
-    result[index + 1] = (((vi >> 4) | vh1) - 16)*d;
-}
-
-struct block_q5_1
-{
-    ushort d;
-    ushort m;
-    uint qh;
-    uchar qs[16];
-};
-
-__kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = vload_half(0, (__global half*) &blocks[i].d);
-    const float m = vload_half(0, (__global half*) &blocks[i].m);
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint l2 = l * 2;
-
-    const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
-    const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
-
-    const uint index = i*32 + l2;
-    result[index + 0] = ((vi & 0xf) | vh0)*d + m;
-    result[index + 1] = ((vi >> 4) | vh1)*d + m;
-}
-
-struct block_q8_0
-{
-    float d;
-    char qs[32];
-};
-
-__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
-}
+constant uint QK4_1 = 32;
+struct block_q4_1
+{
+    float d;
+    float m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+constant uint QK5_0 = 32;
+struct __attribute__ ((packed)) block_q5_0
+{
+    half d;
+    uint32_t qh;
+    uint8_t qs[QK5_0 / 2];
+};
+
+constant uint QK5_1 = 32;
+struct block_q5_1
+{
+    half d;
+    half m;
+    uint32_t qh;
+    uint8_t qs[QK5_1 / 2];
+};
+
+constant uint QK8_0 = 32;
+struct block_q8_0
+{
+    float d;
+    uint8_t qs[QK8_0];
+};
+
+
+__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
+    constant uint qk = QK4_0;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = x[i].d;
+
+    const int x0 = (x[i].qs[j] & 0xf) - 8;
+    const int x1 = (x[i].qs[j] >> 4) - 8;
+
+    y[i*qk + j + 0   ] = x0*d;
+    y[i*qk + j + qk/2] = x1*d;
+}
+
+__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
+    constant uint qk = QK4_1;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = x[i].d;
+    const float m = x[i].m;
+
+    const int x0 = (x[i].qs[j] & 0xf);
+    const int x1 = (x[i].qs[j] >> 4);
+
+    y[i*qk + j + 0   ] = x0*d + m;
+    y[i*qk + j + qk/2] = x1*d + m;
+}
+
+__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
+    constant uint qk = QK5_0;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = vload_half(0, (__global half*) &x[i].d);
+
+    uint32_t qh = x[i].qh;
+
+    const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
+
+    const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
+    const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
+
+    y[i*qk + j + 0   ] = x0*d;
+    y[i*qk + j + qk/2] = x1*d;
+}
+
+__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
+    constant uint qk = QK5_1;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = vload_half(0, (__global half*) &x[i].d);
+    const float m = vload_half(0, (__global half*) &x[i].m);
+
+    uint32_t qh = x[i].qh;
+
+    const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
+
+    const int x0 = (x[i].qs[j] & 0xf) | xh_0;
+    const int x1 = (x[i].qs[j] >> 4) | xh_1;
+
+    y[i*qk + j + 0   ] = x0*d + m;
+    y[i*qk + j + qk/2] = x1*d + m;
+}
+
+__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
+    constant uint qk = QK8_0;
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = x[i].d;
+    y[i*qk + j] = x[i].qs[j]*d;
+}
 
 );
@@ -148,26 +148,12 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f
     } \
 } while (0)
 
-#define QK5_0 32
-typedef struct {
-    ggml_fp16_t d;         // delta
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2]; // nibbles / quants
-} block_q5_0;
-
-
-typedef struct {
-    float d;               // delta
-    uint32_t qh;           // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2]; // nibbles / quants
-} cl_block_q5_0;
-
 static cl_platform_id platform;
 static cl_device_id device;
 static cl_context context;
 static cl_command_queue queue;
 static cl_program program;
-static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0;
+static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
 static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
 static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
 
@@ -238,8 +224,6 @@ void ggml_cl_init(void) {
     CL_CHECK(err, "clCreateKernel");
     kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
     CL_CHECK(err, "clCreateKernel");
-    kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
-    CL_CHECK(err, "clCreateKernel");
     kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
     CL_CHECK(err, "clCreateKernel");
     kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
@@ -274,7 +258,6 @@ void ggml_cl_sgemm_wrapper(
     cl_kernel kernel;
     size_t global = n * k, local, size_qb;
     bool dequant;
-    cl_block_q5_0* cl_host_b;
 
     switch (btype) {
     case GGML_TYPE_F32:
@@ -292,28 +275,11 @@ void ggml_cl_sgemm_wrapper(
         local = 16;
         size_qb = global * (sizeof(float) * 2 + local) / 32;
         break;
-    case GGML_TYPE_Q4_2:
-        dequant = true;
-        kernel = kernel_q4_2;
-        local = 8;
-        size_qb = global * (sizeof(ggml_fp16_t) + local) / 16;
-        break;
     case GGML_TYPE_Q5_0:
         dequant = true;
         kernel = kernel_q5_0;
         local = 16;
-        // For some reason OpenCL seems to be incapable of working with structs of size 22.
-        // 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU...
-        // TODO Find the reason, fix and remove workaround.
-        const block_q5_0* b = (const block_q5_0*) host_b;
-        cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32);
-        for (size_t i = 0; i < global / 32; i++) {
-            cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d);
-            memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t));
-            memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2);
-        }
-        host_b = (const float*) cl_host_b;
-        size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32;
+        size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
         break;
     case GGML_TYPE_Q5_1:
         dequant = true;
@@ -392,7 +358,4 @@ void ggml_cl_sgemm_wrapper(
     clWaitForEvents(1, &ev_c);
     clReleaseEvent(ev_sgemm);
     clReleaseEvent(ev_c);
-    if (btype == GGML_TYPE_Q5_0) {
-        free((void*) cl_host_b);
-    }
 }
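
The removed workaround above existed because the 22-byte q5_0 block (2-byte fp16 delta + 4-byte high-bit mask + 16 nibble bytes) was previously expanded on the CPU before upload; the new kernel source instead declares `block_q5_0` with `__attribute__ ((packed))`, which presumably keeps the device-side layout at exactly 22 bytes to match the new `size_qb` computation. A small host-side sanity check of that size arithmetic; the `_packed` struct name is illustrative and the 24-byte figure for an unpacked struct is a typical, not guaranteed, padding outcome:

    #include <cstdint>

    // 2 (fp16 delta as raw bits) + 4 (high-bit mask) + 16 (low nibbles) = 22 bytes.
    struct __attribute__((packed)) block_q5_0_packed {
        uint16_t d;      // fp16 delta stored as raw bits
        uint32_t qh;     // 5th bits of the 32 quants
        uint8_t  qs[16]; // low nibbles
    };

    static_assert(sizeof(block_q5_0_packed) == 2 + 4 + 16, "q5_0 block must stay 22 bytes");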
ggml.c CHANGED
The diff for this file is too large to render. See raw diff
 
ggml.h CHANGED
@@ -190,9 +190,12 @@
190
  #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
191
  #define GGML_FILE_VERSION 1
192
 
 
 
 
193
  #define GGML_MAX_DIMS 4
194
  #define GGML_MAX_NODES 4096
195
- #define GGML_MAX_PARAMS 16
196
  #define GGML_MAX_CONTEXTS 64
197
  #define GGML_MAX_OPT 4
198
  #define GGML_DEFAULT_N_THREADS 4
@@ -231,7 +234,7 @@ extern "C" {
231
  GGML_TYPE_F16 = 1,
232
  GGML_TYPE_Q4_0 = 2,
233
  GGML_TYPE_Q4_1 = 3,
234
- GGML_TYPE_Q4_2 = 4,
235
  // GGML_TYPE_Q4_3 (5) support has been removed
236
  GGML_TYPE_Q5_0 = 6,
237
  GGML_TYPE_Q5_1 = 7,
@@ -243,6 +246,11 @@ extern "C" {
243
  GGML_TYPE_COUNT,
244
  };
245
 
 
 
 
 
 
246
  // model file types
247
  enum ggml_ftype {
248
  GGML_FTYPE_UNKNOWN = -1,
@@ -251,7 +259,6 @@ extern "C" {
251
  GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
252
  GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
253
  GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
254
- GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
255
  GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
256
  GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
257
  GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
@@ -263,12 +270,16 @@ extern "C" {
263
 
264
  GGML_OP_DUP,
265
  GGML_OP_ADD,
 
 
266
  GGML_OP_SUB,
267
  GGML_OP_MUL,
268
  GGML_OP_DIV,
269
  GGML_OP_SQR,
270
  GGML_OP_SQRT,
 
271
  GGML_OP_SUM,
 
272
  GGML_OP_MEAN,
273
  GGML_OP_REPEAT,
274
  GGML_OP_ABS,
@@ -278,12 +289,15 @@ extern "C" {
278
  GGML_OP_RELU,
279
  GGML_OP_GELU,
280
  GGML_OP_SILU,
 
281
  GGML_OP_NORM, // normalize
282
  GGML_OP_RMS_NORM,
 
283
 
284
  GGML_OP_MUL_MAT,
285
 
286
  GGML_OP_SCALE,
 
287
  GGML_OP_CPY,
288
  GGML_OP_CONT,
289
  GGML_OP_RESHAPE,
@@ -291,9 +305,13 @@ extern "C" {
291
  GGML_OP_PERMUTE,
292
  GGML_OP_TRANSPOSE,
293
  GGML_OP_GET_ROWS,
 
 
294
  GGML_OP_DIAG_MASK_INF,
 
295
  GGML_OP_SOFT_MAX,
296
  GGML_OP_ROPE,
 
297
  GGML_OP_ALIBI,
298
  GGML_OP_CONV_1D_1S,
299
  GGML_OP_CONV_1D_2S,
@@ -322,7 +340,8 @@ extern "C" {
322
 
323
  // n-dimensional tensor
324
  struct ggml_tensor {
325
- enum ggml_type type;
 
326
 
327
  int n_dims;
328
  int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -353,7 +372,7 @@ extern "C" {
353
 
354
  char name[32];
355
 
356
- char padding[8]; // TODO: remove and add padding to name?
357
  };
358
 
359
  // computation graph
@@ -497,6 +516,29 @@ extern "C" {
497
  struct ggml_tensor * a,
498
  struct ggml_tensor * b);
499
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500
  GGML_API struct ggml_tensor * ggml_sub(
501
  struct ggml_context * ctx,
502
  struct ggml_tensor * a,
@@ -520,12 +562,24 @@ extern "C" {
520
  struct ggml_context * ctx,
521
  struct ggml_tensor * a);
522
 
 
 
 
 
 
 
 
 
523
  // return scalar
524
- // TODO: compute sum along rows
525
  GGML_API struct ggml_tensor * ggml_sum(
526
  struct ggml_context * ctx,
527
  struct ggml_tensor * a);
528
 
 
 
 
 
 
529
  // mean along rows
530
  GGML_API struct ggml_tensor * ggml_mean(
531
  struct ggml_context * ctx,
@@ -567,6 +621,13 @@ extern "C" {
567
  struct ggml_context * ctx,
568
  struct ggml_tensor * a);
569
 
 
 
 
 
 
 
 
570
  // normalize along rows
571
  // TODO: eps is hardcoded to 1e-5 for now
572
  GGML_API struct ggml_tensor * ggml_norm(
@@ -577,6 +638,13 @@ extern "C" {
577
  struct ggml_context * ctx,
578
  struct ggml_tensor * a);
579
 
 
 
 
 
 
 
 
580
  // A: m rows, n columns
581
  // B: p rows, n columns (i.e. we transpose it internally)
582
  // result is m columns, p rows
@@ -589,12 +657,66 @@ extern "C" {
589
  // operations on tensors without backpropagation
590
  //
591
 
592
- // in-place, returns view(a)
593
  GGML_API struct ggml_tensor * ggml_scale(
594
  struct ggml_context * ctx,
595
  struct ggml_tensor * a,
596
  struct ggml_tensor * b);
597
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
598
  // a -> b, return view(b)
599
  GGML_API struct ggml_tensor * ggml_cpy(
600
  struct ggml_context * ctx,
@@ -615,6 +737,11 @@ extern "C" {
615
 
616
  // return view(a)
617
  // TODO: when we start computing gradient, make a copy instead of view
 
 
 
 
 
618
  GGML_API struct ggml_tensor * ggml_reshape_2d(
619
  struct ggml_context * ctx,
620
  struct ggml_tensor * a,
@@ -630,6 +757,14 @@ extern "C" {
630
  int64_t ne1,
631
  int64_t ne2);
632
 
 
 
 
 
 
 
 
 
633
  // offset in bytes
634
  GGML_API struct ggml_tensor * ggml_view_1d(
635
  struct ggml_context * ctx,
@@ -655,6 +790,18 @@ extern "C" {
655
  size_t nb2, // slice stride in bytes
656
  size_t offset);
657
 
 
 
 
 
 
 
 
 
 
 
 
 
658
  GGML_API struct ggml_tensor * ggml_permute(
659
  struct ggml_context * ctx,
660
  struct ggml_tensor * a,
@@ -673,20 +820,50 @@ extern "C" {
673
  struct ggml_tensor * a,
674
  struct ggml_tensor * b);
675
 
 
 
 
 
 
 
 
 
 
 
676
  // set elements above the diagonal to -INF
677
- // in-place, returns view(a)
678
  GGML_API struct ggml_tensor * ggml_diag_mask_inf(
679
  struct ggml_context * ctx,
680
  struct ggml_tensor * a,
681
  int n_past);
682
 
683
  // in-place, returns view(a)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
684
  GGML_API struct ggml_tensor * ggml_soft_max(
685
  struct ggml_context * ctx,
686
  struct ggml_tensor * a);
687
 
688
- // rotary position embedding
689
  // in-place, returns view(a)
 
 
 
 
 
690
  // if mode & 1 == 1, skip n_past elements
691
  // if mode & 2 == 1, GPT-NeoX style
692
  // TODO: avoid creating a new tensor every time
@@ -697,6 +874,23 @@ extern "C" {
697
  int n_dims,
698
  int mode);
699
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
700
  // alibi position embedding
701
  // in-place, returns view(a)
702
  struct ggml_tensor * ggml_alibi(
@@ -741,13 +935,13 @@ extern "C" {
741
  GGML_API struct ggml_tensor * ggml_map_unary_f32(
742
  struct ggml_context * ctx,
743
  struct ggml_tensor * a,
744
- const ggml_unary_op_f32_t fun);
745
 
746
  GGML_API struct ggml_tensor * ggml_map_binary_f32(
747
  struct ggml_context * ctx,
748
  struct ggml_tensor * a,
749
  struct ggml_tensor * b,
750
- const ggml_binary_op_f32_t fun);
751
 
752
  //
753
  // automatic differentiation
@@ -876,7 +1070,6 @@ extern "C" {
876
 
877
  GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
878
  GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
879
- GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
880
  GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
881
  GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
882
  GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
 
190
  #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
191
  #define GGML_FILE_VERSION 1
192
 
193
+ #define GGML_QNT_VERSION 1 // bump this on quantization format changes
194
+ #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
195
+
196
  #define GGML_MAX_DIMS 4
197
  #define GGML_MAX_NODES 4096
198
+ #define GGML_MAX_PARAMS 256
199
  #define GGML_MAX_CONTEXTS 64
200
  #define GGML_MAX_OPT 4
201
  #define GGML_DEFAULT_N_THREADS 4
 
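For context, a minimal sketch (not part of this commit) of how a loader can unpack the quantization version that newer converters fold into the ftype header field using GGML_QNT_VERSION_FACTOR; the whisper.cpp hunk further down does exactly this, and `ftype_packed` here is a hypothetical variable name.

    #include <cstdint>
    #include <cstdio>
    #include "ggml.h"

    // split a packed ftype value into the plain ggml_ftype and the quantization version
    static void print_qnt_info(int32_t ftype_packed) {
        const int32_t qntvr = ftype_packed / GGML_QNT_VERSION_FACTOR; // quantization format version
        const int32_t ftype = ftype_packed % GGML_QNT_VERSION_FACTOR; // underlying ggml_ftype value

        std::printf("ftype = %d, qntvr = %d (current GGML_QNT_VERSION = %d)\n",
                ftype, qntvr, GGML_QNT_VERSION);
    }
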
234
  GGML_TYPE_F16 = 1,
235
  GGML_TYPE_Q4_0 = 2,
236
  GGML_TYPE_Q4_1 = 3,
237
+ // GGML_TYPE_Q4_2 = 4, support has been removed
238
  // GGML_TYPE_Q4_3 (5) support has been removed
239
  GGML_TYPE_Q5_0 = 6,
240
  GGML_TYPE_Q5_1 = 7,
 
246
  GGML_TYPE_COUNT,
247
  };
248
 
249
+ enum ggml_backend {
250
+ GGML_BACKEND_CPU = 0,
251
+ GGML_BACKEND_CUDA = 1,
252
+ };
253
+
254
  // model file types
255
  enum ggml_ftype {
256
  GGML_FTYPE_UNKNOWN = -1,
 
259
  GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
260
  GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
261
  GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
 
262
  GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
263
  GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
264
  GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
 
270
 
271
  GGML_OP_DUP,
272
  GGML_OP_ADD,
273
+ GGML_OP_ADD1,
274
+ GGML_OP_ACC,
275
  GGML_OP_SUB,
276
  GGML_OP_MUL,
277
  GGML_OP_DIV,
278
  GGML_OP_SQR,
279
  GGML_OP_SQRT,
280
+ GGML_OP_LOG,
281
  GGML_OP_SUM,
282
+ GGML_OP_SUM_ROWS,
283
  GGML_OP_MEAN,
284
  GGML_OP_REPEAT,
285
  GGML_OP_ABS,
 
289
  GGML_OP_RELU,
290
  GGML_OP_GELU,
291
  GGML_OP_SILU,
292
+ GGML_OP_SILU_BACK,
293
  GGML_OP_NORM, // normalize
294
  GGML_OP_RMS_NORM,
295
+ GGML_OP_RMS_NORM_BACK,
296
 
297
  GGML_OP_MUL_MAT,
298
 
299
  GGML_OP_SCALE,
300
+ GGML_OP_SET,
301
  GGML_OP_CPY,
302
  GGML_OP_CONT,
303
  GGML_OP_RESHAPE,
 
305
  GGML_OP_PERMUTE,
306
  GGML_OP_TRANSPOSE,
307
  GGML_OP_GET_ROWS,
308
+ GGML_OP_GET_ROWS_BACK,
309
+ GGML_OP_DIAG,
310
  GGML_OP_DIAG_MASK_INF,
311
+ GGML_OP_DIAG_MASK_ZERO,
312
  GGML_OP_SOFT_MAX,
313
  GGML_OP_ROPE,
314
+ GGML_OP_ROPE_BACK,
315
  GGML_OP_ALIBI,
316
  GGML_OP_CONV_1D_1S,
317
  GGML_OP_CONV_1D_2S,
 
340
 
341
  // n-dimensional tensor
342
  struct ggml_tensor {
343
+ enum ggml_type type;
344
+ enum ggml_backend backend;
345
 
346
  int n_dims;
347
  int64_t ne[GGML_MAX_DIMS]; // number of elements
 
372
 
373
  char name[32];
374
 
375
+ char padding[16];
376
  };
377
 
378
  // computation graph
 
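A small sketch (the context size is arbitrary, field layout as declared above) of the new per-tensor `type`/`backend` fields; freshly created tensors stay on the CPU backend unless explicitly offloaded.

    #include <cstdio>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 32);

        // each tensor now records which backend owns its data
        std::printf("type = %d, backend = %d, ne = [%lld, %lld]\n",
                (int) t->type, (int) t->backend, (long long) t->ne[0], (long long) t->ne[1]);

        ggml_free(ctx);
        return 0;
    }
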
516
  struct ggml_tensor * a,
517
  struct ggml_tensor * b);
518
 
519
+ GGML_API struct ggml_tensor * ggml_add1(
520
+ struct ggml_context * ctx,
521
+ struct ggml_tensor * a,
522
+ struct ggml_tensor * b);
523
+
524
+ GGML_API struct ggml_tensor * ggml_acc(
525
+ struct ggml_context * ctx,
526
+ struct ggml_tensor * a,
527
+ struct ggml_tensor * b,
528
+ size_t nb1,
529
+ size_t nb2,
530
+ size_t nb3,
531
+ size_t offset);
532
+
533
+ GGML_API struct ggml_tensor * ggml_acc_inplace(
534
+ struct ggml_context * ctx,
535
+ struct ggml_tensor * a,
536
+ struct ggml_tensor * b,
537
+ size_t nb1,
538
+ size_t nb2,
539
+ size_t nb3,
540
+ size_t offset);
541
+
542
  GGML_API struct ggml_tensor * ggml_sub(
543
  struct ggml_context * ctx,
544
  struct ggml_tensor * a,
 
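A brief sketch (not from the commit) of the two new element-wise ops: ggml_add1 broadcasts a 1-element tensor over `a`, and ggml_acc adds `b` into a strided view of `a`, following the same nb/offset convention as the ggml_set family below; `a` and `row` are hypothetical tensors living in `ctx`.

    #include "ggml.h"

    // a: 2-D f32 tensor, row: 1-D f32 tensor of length a->ne[0]
    static struct ggml_tensor * add1_and_acc(struct ggml_context * ctx,
                                             struct ggml_tensor  * a,
                                             struct ggml_tensor  * row) {
        // a + 1.0f, element-wise (the scalar is broadcast)
        struct ggml_tensor * a1 = ggml_add1(ctx, a, ggml_new_f32(ctx, 1.0f));

        // accumulate `row` into the second row of the result:
        // strides come from `a`, the byte offset of one full row selects row 1
        return ggml_acc(ctx, a1, row, a->nb[1], a->nb[2], a->nb[3], a->nb[1]);
    }
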
562
  struct ggml_context * ctx,
563
  struct ggml_tensor * a);
564
 
565
+ GGML_API struct ggml_tensor * ggml_log(
566
+ struct ggml_context * ctx,
567
+ struct ggml_tensor * a);
568
+
569
+ GGML_API struct ggml_tensor * ggml_log_inplace(
570
+ struct ggml_context * ctx,
571
+ struct ggml_tensor * a);
572
+
573
  // return scalar
 
574
  GGML_API struct ggml_tensor * ggml_sum(
575
  struct ggml_context * ctx,
576
  struct ggml_tensor * a);
577
 
578
+ // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
579
+ GGML_API struct ggml_tensor * ggml_sum_rows(
580
+ struct ggml_context * ctx,
581
+ struct ggml_tensor * a);
582
+
583
  // mean along rows
584
  GGML_API struct ggml_tensor * ggml_mean(
585
  struct ggml_context * ctx,
 
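As a quick illustration of the new reduction (a sketch, not from the commit): ggml_log is element-wise and ggml_sum_rows collapses the first dimension, so an input of shape [ne0, ne1, ne2, ne3] becomes [1, ne1, ne2, ne3].

    #include "ggml.h"

    // row-wise log-sums of an f32 tensor `a`; result has shape [1, a->ne[1], a->ne[2], a->ne[3]]
    static struct ggml_tensor * log_row_sums(struct ggml_context * ctx, struct ggml_tensor * a) {
        return ggml_sum_rows(ctx, ggml_log(ctx, a));
    }
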
621
  struct ggml_context * ctx,
622
  struct ggml_tensor * a);
623
 
624
+ // a - x
625
+ // b - dy
626
+ GGML_API struct ggml_tensor * ggml_silu_back(
627
+ struct ggml_context * ctx,
628
+ struct ggml_tensor * a,
629
+ struct ggml_tensor * b);
630
+
631
  // normalize along rows
632
  // TODO: eps is hardcoded to 1e-5 for now
633
  GGML_API struct ggml_tensor * ggml_norm(
 
638
  struct ggml_context * ctx,
639
  struct ggml_tensor * a);
640
 
641
+ // a - x
642
+ // b - dy
643
+ GGML_API struct ggml_tensor * ggml_rms_norm_back(
644
+ struct ggml_context * ctx,
645
+ struct ggml_tensor * a,
646
+ struct ggml_tensor * b);
647
+
648
  // A: m rows, n columns
649
  // B: p rows, n columns (i.e. we transpose it internally)
650
  // result is m columns, p rows
 
657
  // operations on tensors without backpropagation
658
  //
659
 
 
660
  GGML_API struct ggml_tensor * ggml_scale(
661
  struct ggml_context * ctx,
662
  struct ggml_tensor * a,
663
  struct ggml_tensor * b);
664
 
665
+ // in-place, returns view(a)
666
+ GGML_API struct ggml_tensor * ggml_scale_inplace(
667
+ struct ggml_context * ctx,
668
+ struct ggml_tensor * a,
669
+ struct ggml_tensor * b);
670
+
671
+ // b -> view(a,offset,nb1,nb2,nb3), return modified a
672
+ GGML_API struct ggml_tensor * ggml_set(
673
+ struct ggml_context * ctx,
674
+ struct ggml_tensor * a,
675
+ struct ggml_tensor * b,
676
+ size_t nb1,
677
+ size_t nb2,
678
+ size_t nb3,
679
+ size_t offset);
680
+
681
+ // b -> view(a,offset,nb1,nb2,nb3), return view(a)
682
+ GGML_API struct ggml_tensor * ggml_set_inplace(
683
+ struct ggml_context * ctx,
684
+ struct ggml_tensor * a,
685
+ struct ggml_tensor * b,
686
+ size_t nb1,
687
+ size_t nb2,
688
+ size_t nb3,
689
+ size_t offset);
690
+
691
+ GGML_API struct ggml_tensor * ggml_set_1d(
692
+ struct ggml_context * ctx,
693
+ struct ggml_tensor * a,
694
+ struct ggml_tensor * b,
695
+ size_t offset);
696
+
697
+ GGML_API struct ggml_tensor * ggml_set_1d_inplace(
698
+ struct ggml_context * ctx,
699
+ struct ggml_tensor * a,
700
+ struct ggml_tensor * b,
701
+ size_t offset);
702
+
703
+ // b -> view(a,offset,nb1,nb2,nb3), return modified a
704
+ GGML_API struct ggml_tensor * ggml_set_2d(
705
+ struct ggml_context * ctx,
706
+ struct ggml_tensor * a,
707
+ struct ggml_tensor * b,
708
+ size_t nb1,
709
+ size_t offset);
710
+
711
+ // b -> view(a,offset,nb1,nb2,nb3), return view(a)
712
+ GGML_API struct ggml_tensor * ggml_set_2d_inplace(
713
+ struct ggml_context * ctx,
714
+ struct ggml_tensor * a,
715
+ struct ggml_tensor * b,
716
+ size_t nb1,
717
+ size_t offset);
718
+
719
+
720
  // a -> b, return view(b)
721
  GGML_API struct ggml_tensor * ggml_cpy(
722
  struct ggml_context * ctx,
 
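A minimal sketch (with hypothetical `cache`/`cur` tensors) of the new in-place scale and the ggml_set family: the data of `cur` is scaled in place, then written into slot `i` of a flat cache tensor via ggml_set_1d, which returns the modified `cache`.

    #include "ggml.h"

    // cache: large 1-D f32 tensor, cur: smaller 1-D f32 tensor, several of which fit into cache
    static struct ggml_tensor * store_scaled(struct ggml_context * ctx,
                                             struct ggml_tensor  * cache,
                                             struct ggml_tensor  * cur,
                                             int                   i) {
        cur = ggml_scale_inplace(ctx, cur, ggml_new_f32(ctx, 0.5f));

        const size_t offset = (size_t) i*ggml_nbytes(cur); // byte offset of slot i
        return ggml_set_1d(ctx, cache, cur, offset);
    }
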
737
 
738
  // return view(a)
739
  // TODO: when we start computing gradient, make a copy instead of view
740
+ GGML_API struct ggml_tensor * ggml_reshape_1d(
741
+ struct ggml_context * ctx,
742
+ struct ggml_tensor * a,
743
+ int64_t ne0);
744
+
745
  GGML_API struct ggml_tensor * ggml_reshape_2d(
746
  struct ggml_context * ctx,
747
  struct ggml_tensor * a,
 
757
  int64_t ne1,
758
  int64_t ne2);
759
 
760
+ GGML_API struct ggml_tensor * ggml_reshape_4d(
761
+ struct ggml_context * ctx,
762
+ struct ggml_tensor * a,
763
+ int64_t ne0,
764
+ int64_t ne1,
765
+ int64_t ne2,
766
+ int64_t ne3);
767
+
768
  // offset in bytes
769
  GGML_API struct ggml_tensor * ggml_view_1d(
770
  struct ggml_context * ctx,
 
790
  size_t nb2, // slice stride in bytes
791
  size_t offset);
792
 
793
+ GGML_API struct ggml_tensor * ggml_view_4d(
794
+ struct ggml_context * ctx,
795
+ struct ggml_tensor * a,
796
+ int64_t ne0,
797
+ int64_t ne1,
798
+ int64_t ne2,
799
+ int64_t ne3,
800
+ size_t nb1, // row stride in bytes
801
+ size_t nb2, // slice stride in bytes
802
+ size_t nb3,
803
+ size_t offset);
804
+
805
  GGML_API struct ggml_tensor * ggml_permute(
806
  struct ggml_context * ctx,
807
  struct ggml_tensor * a,
 
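One way the new 4-D view can be used (a sketch with hypothetical names, not taken from the commit): split the rows of a contiguous 2-D activation tensor [n_state, n_tokens] into attention heads without copying. ggml_reshape_4d gives the same result for a contiguous tensor; ggml_view_4d also allows explicit byte strides.

    #include "ggml.h"

    // x: contiguous 2-D f32 tensor [n_state, n_tok]; returns a view [head_dim, n_head, n_tok, 1]
    static struct ggml_tensor * split_heads(struct ggml_context * ctx,
                                            struct ggml_tensor  * x,
                                            int                   n_head) {
        const int64_t head_dim = x->ne[0]/n_head;
        const size_t  es       = ggml_element_size(x);

        return ggml_view_4d(ctx, x,
                head_dim, n_head, x->ne[1], 1,
                (size_t) head_dim*es,           // nb1: next head within a row
                (size_t) x->ne[0]*es,           // nb2: next token (row)
                (size_t) x->ne[0]*x->ne[1]*es,  // nb3: whole tensor (single slice)
                0);                             // offset in bytes
    }
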
820
  struct ggml_tensor * a,
821
  struct ggml_tensor * b);
822
 
823
+ GGML_API struct ggml_tensor * ggml_get_rows_back(
824
+ struct ggml_context * ctx,
825
+ struct ggml_tensor * a,
826
+ struct ggml_tensor * b,
827
+ struct ggml_tensor * c);
828
+
829
+ GGML_API struct ggml_tensor * ggml_diag(
830
+ struct ggml_context * ctx,
831
+ struct ggml_tensor * a);
832
+
833
  // set elements above the diagonal to -INF
 
834
  GGML_API struct ggml_tensor * ggml_diag_mask_inf(
835
  struct ggml_context * ctx,
836
  struct ggml_tensor * a,
837
  int n_past);
838
 
839
  // in-place, returns view(a)
840
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
841
+ struct ggml_context * ctx,
842
+ struct ggml_tensor * a,
843
+ int n_past);
844
+
845
+ // set elements above the diagonal to 0
846
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero(
847
+ struct ggml_context * ctx,
848
+ struct ggml_tensor * a,
849
+ int n_past);
850
+
851
+ // in-place, returns view(a)
852
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
853
+ struct ggml_context * ctx,
854
+ struct ggml_tensor * a,
855
+ int n_past);
856
+
857
  GGML_API struct ggml_tensor * ggml_soft_max(
858
  struct ggml_context * ctx,
859
  struct ggml_tensor * a);
860
 
 
861
  // in-place, returns view(a)
862
+ GGML_API struct ggml_tensor * ggml_soft_max_inplace(
863
+ struct ggml_context * ctx,
864
+ struct ggml_tensor * a);
865
+
866
+ // rotary position embedding
867
  // if mode & 1 == 1, skip n_past elements
868
  // if mode & 2 == 1, GPT-NeoX style
869
  // TODO: avoid creating a new tensor every time
 
874
  int n_dims,
875
  int mode);
876
 
877
+ // in-place, returns view(a)
878
+ GGML_API struct ggml_tensor * ggml_rope_inplace(
879
+ struct ggml_context * ctx,
880
+ struct ggml_tensor * a,
881
+ int n_past,
882
+ int n_dims,
883
+ int mode);
884
+
885
+ // rotary position embedding backward, i.e. compute dx from dy
886
+ // a - dy
887
+ GGML_API struct ggml_tensor * ggml_rope_back(
888
+ struct ggml_context * ctx,
889
+ struct ggml_tensor * a,
890
+ int n_past,
891
+ int n_dims,
892
+ int mode);
893
+
894
  // alibi position embedding
895
  // in-place, returns view(a)
896
  struct ggml_tensor * ggml_alibi(
 
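The in-place mask/softmax pair is what the whisper.cpp decoder below switches to; a sketch of that masked-attention step is shown here (ggml_diag_mask_zero is the analogous variant that writes 0 instead of -INF).

    #include "ggml.h"

    // KQ: attention scores; positions past the diagonal (future tokens) are masked out
    static struct ggml_tensor * masked_soft_max(struct ggml_context * ctx,
                                                struct ggml_tensor  * KQ,
                                                int                   n_past) {
        // modifies KQ's data in place and returns a view of it
        struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx, KQ, n_past);

        return ggml_soft_max_inplace(ctx, KQ_masked);
    }
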
935
  GGML_API struct ggml_tensor * ggml_map_unary_f32(
936
  struct ggml_context * ctx,
937
  struct ggml_tensor * a,
938
+ ggml_unary_op_f32_t fun);
939
 
940
  GGML_API struct ggml_tensor * ggml_map_binary_f32(
941
  struct ggml_context * ctx,
942
  struct ggml_tensor * a,
943
  struct ggml_tensor * b,
944
+ ggml_binary_op_f32_t fun);
945
 
946
  //
947
  // automatic differentiation
 
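The map hooks above now take the callback parameter without the const qualifier; a minimal sketch of plugging in a custom element-wise op, assuming the f32 callback signature is (n, dst, src) as in ggml.h of this vintage:

    #include <math.h>
    #include "ggml.h"

    // custom element-wise op: dst[i] = expf(src[i])
    static void my_exp_f32(const int n, float * dst, const float * src) {
        for (int i = 0; i < n; ++i) {
            dst[i] = expf(src[i]);
        }
    }

    static struct ggml_tensor * map_exp(struct ggml_context * ctx, struct ggml_tensor * a) {
        return ggml_map_unary_f32(ctx, a, my_exp_f32);
    }
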
1070
 
1071
  GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
1072
  GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
 
1073
  GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
1074
  GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
1075
  GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
whisper.cpp CHANGED
@@ -291,15 +291,6 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
291
  { MODEL_LARGE, 1124ull*MB },
292
  },
293
  },
294
- { GGML_TYPE_Q4_2,
295
- {
296
- { MODEL_TINY, 26ull*MB },
297
- { MODEL_BASE, 50ull*MB },
298
- { MODEL_SMALL, 154ull*MB },
299
- { MODEL_MEDIUM, 470ull*MB },
300
- { MODEL_LARGE, 940ull*MB },
301
- },
302
- },
303
  { GGML_TYPE_Q5_0,
304
  {
305
  { MODEL_TINY, 30ull*MB },
@@ -861,6 +852,10 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
861
  model.type = e_model::MODEL_LARGE;
862
  }
863
 
864
  // for the big tensors, we have the option to store the data in 16-bit floats or quantized
865
  // in order to save memory and also to speed up the computation
866
  wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
@@ -882,6 +877,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
882
  fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
883
  fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
884
  fprintf(stderr, "%s: ftype = %d\n", __func__, model.hparams.ftype);
 
885
  fprintf(stderr, "%s: type = %d\n", __func__, model.type);
886
 
887
  // print memory requirements
@@ -1106,7 +1102,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1106
  ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
1107
  }
1108
 
1109
- ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
1110
 
1111
  fprintf(stderr, "%s: model ctx = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
1112
  }
@@ -1554,14 +1550,14 @@ static bool whisper_encode_internal(
1554
  Qcur),
1555
  Qcur);
1556
 
1557
- //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1558
 
1559
  // note: no bias for Key
1560
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
1561
  layer.attn_k_w,
1562
  cur);
1563
 
1564
- //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1565
 
1566
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
1567
  layer.attn_v_w,
@@ -1621,12 +1617,12 @@ static bool whisper_encode_internal(
1621
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
1622
 
1623
  struct ggml_tensor * KQ_scaled =
1624
- ggml_scale(ctx0,
1625
  KQ,
1626
  ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
1627
  );
1628
 
1629
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
1630
 
1631
  struct ggml_tensor * V =
1632
  ggml_cpy(ctx0,
@@ -1809,7 +1805,7 @@ static bool whisper_encode_internal(
1809
  layer.cross_attn_k_w,
1810
  cur);
1811
 
1812
- Kcross = ggml_scale(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
1813
 
1814
  wstate.use_buf(ctx0, 1);
1815
 
@@ -1956,14 +1952,14 @@ static bool whisper_decode_internal(
1956
  Qcur),
1957
  Qcur);
1958
 
1959
- Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1960
 
1961
  // note: no bias for Key
1962
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
1963
  layer.attn_k_w,
1964
  cur);
1965
 
1966
- Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1967
 
1968
  // store key and value to memory
1969
  {
@@ -2012,14 +2008,14 @@ static bool whisper_decode_internal(
2012
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
2013
 
2014
  //struct ggml_tensor * KQ_scaled =
2015
- // ggml_scale(ctx0,
2016
  // KQ,
2017
  // ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
2018
  // );
2019
 
2020
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);
2021
 
2022
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
2023
 
2024
  struct ggml_tensor * V =
2025
  ggml_view_3d(ctx0, kv_self.v,
@@ -2083,7 +2079,7 @@ static bool whisper_decode_internal(
2083
  Qcur),
2084
  Qcur);
2085
 
2086
- Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
2087
 
2088
  // Kcross is already scaled
2089
  struct ggml_tensor * Kcross =
@@ -2123,15 +2119,15 @@ static bool whisper_decode_internal(
2123
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
2124
 
2125
  //struct ggml_tensor * KQ_scaled =
2126
- // ggml_scale(ctx0,
2127
  // KQ,
2128
  // ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
2129
  // );
2130
 
2131
  // no masking for cross-attention
2132
- //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
2133
 
2134
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
2135
 
2136
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
2137
 
@@ -4903,7 +4899,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
4903
  // b: N*N*sizeof(float)
4904
  // c: N*N*sizeof(float)
4905
  // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
4906
- std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
4907
 
4908
  // put a bunch of random data in the buffer
4909
  for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
@@ -4911,7 +4907,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
4911
  for (int j = 0; j < (int) sizes.size(); j++) {
4912
  int n_q4_0 = 0;
4913
  int n_q4_1 = 0;
4914
- int n_q4_2 = 0;
4915
  int n_q5_0 = 0;
4916
  int n_q5_1 = 0;
4917
  int n_q8_0 = 0;
@@ -4921,7 +4916,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
4921
  // GFLOPS/s
4922
  double s_q4_0 = 0.0;
4923
  double s_q4_1 = 0.0;
4924
- double s_q4_2 = 0.0;
4925
  double s_q5_0 = 0.0;
4926
  double s_q5_1 = 0.0;
4927
  double s_q8_0 = 0.0;
@@ -4930,18 +4924,17 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
4930
 
4931
  const size_t N = sizes[j];
4932
 
4933
- for (int k = 0; k < 8; ++k) {
4934
  const ggml_type wtype =
4935
  k == 0 ? GGML_TYPE_Q4_0 :
4936
  k == 1 ? GGML_TYPE_Q4_1 :
4937
- k == 2 ? GGML_TYPE_Q4_2 :
4938
- k == 3 ? GGML_TYPE_Q5_0 :
4939
- k == 4 ? GGML_TYPE_Q5_1 :
4940
- k == 5 ? GGML_TYPE_Q8_0 :
4941
- k == 6 ? GGML_TYPE_F16 : GGML_TYPE_F32;
4942
 
4943
- double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q4_2 : k == 3 ? s_q5_0 : k == 4 ? s_q5_1 : k == 5 ? s_q8_0 : k == 6 ? s_fp16 : /*k == 7*/ s_fp32;
4944
- int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q4_2 : k == 3 ? n_q5_0 : k == 4 ? n_q5_1 : k == 5 ? n_q8_0 : k == 6 ? n_fp16 : /*k == 7*/ n_fp32;
4945
 
4946
  struct ggml_init_params gparams = {
4947
  /*.mem_size =*/ buf.size(),
@@ -4985,9 +4978,9 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
4985
  s = ((2.0*N*N*N*n)/tsum)*1e-9;
4986
  }
4987
 
4988
- // Q4_0 | Q4_1 | Q4_2
4989
- snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs) | Q4_2 %7.1f GFLOPS (%3d runs)\n",
4990
- N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_q4_2, n_q4_2);
4991
  s += strbuf;
4992
 
4993
  // Q5_0 | Q5_1 | Q8_0
 
291
  { MODEL_LARGE, 1124ull*MB },
292
  },
293
  },
 
294
  { GGML_TYPE_Q5_0,
295
  {
296
  { MODEL_TINY, 30ull*MB },
 
852
  model.type = e_model::MODEL_LARGE;
853
  }
854
 
855
+ const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
856
+
857
+ hparams.ftype %= GGML_QNT_VERSION_FACTOR;
858
+
859
  // for the big tensors, we have the option to store the data in 16-bit floats or quantized
860
  // in order to save memory and also to speed up the computation
861
  wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
 
877
  fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
878
  fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
879
  fprintf(stderr, "%s: ftype = %d\n", __func__, model.hparams.ftype);
880
+ fprintf(stderr, "%s: qntvr = %d\n", __func__, qntvr);
881
  fprintf(stderr, "%s: type = %d\n", __func__, model.type);
882
 
883
  // print memory requirements
 
1102
  ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
1103
  }
1104
 
1105
+ ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead
1106
 
1107
  fprintf(stderr, "%s: model ctx = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
1108
  }
 
1550
  Qcur),
1551
  Qcur);
1552
 
1553
+ //Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1554
 
1555
  // note: no bias for Key
1556
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
1557
  layer.attn_k_w,
1558
  cur);
1559
 
1560
+ //Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1561
 
1562
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
1563
  layer.attn_v_w,
 
1617
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
1618
 
1619
  struct ggml_tensor * KQ_scaled =
1620
+ ggml_scale_inplace(ctx0,
1621
  KQ,
1622
  ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
1623
  );
1624
 
1625
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);
1626
 
1627
  struct ggml_tensor * V =
1628
  ggml_cpy(ctx0,
 
1805
  layer.cross_attn_k_w,
1806
  cur);
1807
 
1808
+ Kcross = ggml_scale_inplace(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
1809
 
1810
  wstate.use_buf(ctx0, 1);
1811
 
 
1952
  Qcur),
1953
  Qcur);
1954
 
1955
+ Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1956
 
1957
  // note: no bias for Key
1958
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
1959
  layer.attn_k_w,
1960
  cur);
1961
 
1962
+ Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1963
 
1964
  // store key and value to memory
1965
  {
 
2008
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
2009
 
2010
  //struct ggml_tensor * KQ_scaled =
2011
+ // ggml_scale_inplace(ctx0,
2012
  // KQ,
2013
  // ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
2014
  // );
2015
 
2016
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);
2017
 
2018
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
2019
 
2020
  struct ggml_tensor * V =
2021
  ggml_view_3d(ctx0, kv_self.v,
 
2079
  Qcur),
2080
  Qcur);
2081
 
2082
+ Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
2083
 
2084
  // Kcross is already scaled
2085
  struct ggml_tensor * Kcross =
 
2119
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
2120
 
2121
  //struct ggml_tensor * KQ_scaled =
2122
+ // ggml_scale_inplace(ctx0,
2123
  // KQ,
2124
  // ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
2125
  // );
2126
 
2127
  // no masking for cross-attention
2128
+ //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2129
 
2130
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ);
2131
 
2132
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
2133
 
 
4899
  // b: N*N*sizeof(float)
4900
  // c: N*N*sizeof(float)
4901
  // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
4902
+ std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);
4903
 
4904
  // put a bunch of random data in the buffer
4905
  for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
 
4907
  for (int j = 0; j < (int) sizes.size(); j++) {
4908
  int n_q4_0 = 0;
4909
  int n_q4_1 = 0;
 
4910
  int n_q5_0 = 0;
4911
  int n_q5_1 = 0;
4912
  int n_q8_0 = 0;
 
4916
  // GFLOPS/s
4917
  double s_q4_0 = 0.0;
4918
  double s_q4_1 = 0.0;
 
4919
  double s_q5_0 = 0.0;
4920
  double s_q5_1 = 0.0;
4921
  double s_q8_0 = 0.0;
 
4924
 
4925
  const size_t N = sizes[j];
4926
 
4927
+ for (int k = 0; k < 7; ++k) {
4928
  const ggml_type wtype =
4929
  k == 0 ? GGML_TYPE_Q4_0 :
4930
  k == 1 ? GGML_TYPE_Q4_1 :
4931
+ k == 2 ? GGML_TYPE_Q5_0 :
4932
+ k == 3 ? GGML_TYPE_Q5_1 :
4933
+ k == 4 ? GGML_TYPE_Q8_0 :
4934
+ k == 5 ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
4935
 
4936
+ double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : k == 4 ? s_q8_0 : k == 5 ? s_fp16 : /*k == 6*/ s_fp32;
4937
+ int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : k == 4 ? n_q8_0 : k == 5 ? n_fp16 : /*k == 6*/ n_fp32;
4938
 
4939
  struct ggml_init_params gparams = {
4940
  /*.mem_size =*/ buf.size(),
 
4978
  s = ((2.0*N*N*N*n)/tsum)*1e-9;
4979
  }
4980
 
4981
+ // Q4_0 | Q4_1
4982
+ snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)\n",
4983
+ N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1);
4984
  s += strbuf;
4985
 
4986
  // Q5_0 | Q5_1 | Q8_0