ggml : sync latest ggml

- New Q4 and Q5 formats - Various improvements
2025-08-16 11:38:59 +02:00 · 2023-05-14 18:04:23 +03:00
parent d652cf12ec
commit e693074aa6
10 changed files with 4550 additions and 2140 deletions
--- a/examples/common-ggml.cpp
+++ b/examples/common-ggml.cpp
@ -6,7 +6,6 @@
 static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
    {"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
    {"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
-    {"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
    {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
    {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
    {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
@ -46,7 +45,6 @@ bool ggml_common_quantize_0(
    switch (ftype) {
        case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
        case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
-        case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
        case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
        case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
        case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
@ -171,10 +169,6 @@ bool ggml_common_quantize_0(
                    {
                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                    } break;
-                case GGML_TYPE_Q4_2:
-                    {
-                        cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-                    } break;
                case GGML_TYPE_Q5_0:
                    {
                        cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -38,6 +38,20 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
        } else if (arg == "-h" || arg == "--help") {
            gpt_print_usage(argc, argv, params);
            exit(0);
+        } else if (arg == "-f" || arg == "--file") {
+            if (++i > argc) {
+                fprintf(stderr, "Invalid file param");
+                break;
+            }
+            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                break;
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
+            if (params.prompt.back() == '\n') {
+                params.prompt.pop_back();
+            }
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            gpt_print_usage(argc, argv, params);
@ -57,6 +71,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
    fprintf(stderr, "                        prompt to start generation with (default: random)\n");
+    fprintf(stderr, "  -f FNAME, --file FNAME\n");
+    fprintf(stderr, "                        load prompt from a file\n");
    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
    fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
@ -192,6 +208,10 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
    return result;
 }

+void gpt_vocab::add_special_token(const std::string & token) {
+    special_tokens.push_back(token);
+}
+
 std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
    std::vector<std::string> words;

@ -200,6 +220,20 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
        std::string str = text;
        std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";

+        // Generate the subpattern from the special_tokens vector if it's not empty
+        if (!vocab.special_tokens.empty()) {
+            std::string special_tokens_subpattern;
+            for (const auto & token : vocab.special_tokens) {
+                if (!special_tokens_subpattern.empty()) {
+                    special_tokens_subpattern += "|";
+                }
+                special_tokens_subpattern += token;
+            }
+
+            // Modify the regex pattern with the generated special tokens subpattern
+            pat = special_tokens_subpattern + "|" + pat;
+        }
+
        std::regex re(pat);
        std::smatch m;

--- a/examples/common.h
+++ b/examples/common.h
@ -53,6 +53,9 @@ struct gpt_vocab {

    std::map<token, id> token_to_id;
    std::map<id, token> id_to_token;
+    std::vector<std::string> special_tokens;
+
+    void add_special_token(const std::string & token);
 };

 // poor-man's JSON parsing
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -25,7 +25,7 @@ struct whisper_hparams {
    int32_t n_text_head   = 6;
    int32_t n_text_layer  = 4;
    int32_t n_mels        = 80;
-    int32_t f16           = 1;
+    int32_t ftype         = 1;
 };

 struct whisper_filters {
@ -79,7 +79,10 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
        finp.read((char *) &hparams.n_text_head,   sizeof(hparams.n_text_head));
        finp.read((char *) &hparams.n_text_layer,  sizeof(hparams.n_text_layer));
        finp.read((char *) &hparams.n_mels,        sizeof(hparams.n_mels));
-        finp.read((char *) &hparams.f16,           sizeof(hparams.f16));
+        finp.read((char *) &hparams.ftype,         sizeof(hparams.ftype));
+
+        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;

        fprintf(stderr, "%s: n_vocab       = %d\n", __func__, hparams.n_vocab);
        fprintf(stderr, "%s: n_audio_ctx   = %d\n", __func__, hparams.n_audio_ctx);
@ -91,7 +94,10 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
        fprintf(stderr, "%s: n_text_head   = %d\n", __func__, hparams.n_text_head);
        fprintf(stderr, "%s: n_text_layer  = %d\n", __func__, hparams.n_text_layer);
        fprintf(stderr, "%s: n_mels        = %d\n", __func__, hparams.n_mels);
-        fprintf(stderr, "%s: f16           = %d\n", __func__, hparams.f16);
+        fprintf(stderr, "%s: ftype (src)   = %d\n", __func__, hparams.ftype);
+        fprintf(stderr, "%s: qntvr (src)   = %d\n", __func__, qntvr_src);
+        fprintf(stderr, "%s: ftype (dst)   = %d\n", __func__, ftype_dst);
+        fprintf(stderr, "%s: qntvr (dst)   = %d\n", __func__, GGML_QNT_VERSION);

        fout.write((char *) &hparams.n_vocab,       sizeof(hparams.n_vocab));
        fout.write((char *) &hparams.n_audio_ctx,   sizeof(hparams.n_audio_ctx));
@ -103,7 +109,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
        fout.write((char *) &hparams.n_text_head,   sizeof(hparams.n_text_head));
        fout.write((char *) &hparams.n_text_layer,  sizeof(hparams.n_text_layer));
        fout.write((char *) &hparams.n_mels,        sizeof(hparams.n_mels));
-        fout.write((char *) &ftype,                 sizeof(hparams.f16));
+        fout.write((char *) &ftype_dst,             sizeof(hparams.ftype));
    }

    // load mel filters
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -32,9 +32,15 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
        }                                                                               \
    } while (0)

+typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
 typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
+typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream);
+
+// QK = number of values after dequantization
+// QR = QK / number of values before dequantization

 #define QK4_0 32
+#define QR4_0 2
 typedef struct {
    float   d;              // delta
    uint8_t qs[QK4_0 / 2];  // nibbles / quants
@ -42,6 +48,7 @@ typedef struct {
 static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");

 #define QK4_1 32
+#define QR4_1 2
 typedef struct {
    float   d;              // delta
    float   m;              // min
@ -49,14 +56,8 @@ typedef struct {
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");

-#define QK4_2 16
-typedef struct {
-    half  d;                // delta
-    uint8_t qs[QK4_2 / 2];  // nibbles / quants
-} block_q4_2;
-static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
-
 #define QK5_0 32
+#define QR5_0 2
 typedef struct {
    half d;                 // delta
    uint8_t qh[4];          // 5-th bit of quants
@ -65,6 +66,7 @@ typedef struct {
 static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");

 #define QK5_1 32
+#define QR5_1 2
 typedef struct {
    half d;                 // delta
    half m;                 // min
@ -74,36 +76,121 @@ typedef struct {
 static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");

 #define QK8_0 32
+#define QR8_0 1
 typedef struct {
    float   d;              // delta
    int8_t  qs[QK8_0];      // quants
 } block_q8_0;
 static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");

+#define CUDA_DMMV_BLOCK_SIZE 32
+
+static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q4_0 * x = (const block_q4_0 *) vx;
+
+    const float d = x[ib].d;
+
+    const uint8_t vui = x[ib].qs[iqs];
+
+    const int8_t vi0 = vui & 0xF;
+    const int8_t vi1 = vui >> 4;
+
+    v0 = (vi0 - 8)*d;
+    v1 = (vi1 - 8)*d;
+}
+
+static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q4_1 * x = (const block_q4_1 *) vx;
+
+    const float d = x[ib].d;
+    const float m = x[ib].m;
+
+    const uint8_t vui = x[ib].qs[iqs];
+
+    const int8_t vi0 = vui & 0xF;
+    const int8_t vi1 = vui >> 4;
+
+    v0 = vi0*d + m;
+    v1 = vi1*d + m;
+}
+
+static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q5_0 * x = (const block_q5_0 *) vx;
+
+    const float d = x[ib].d;
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const uint8_t xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16;
+    const int32_t x1 = ((x[ib].qs[iqs] >>  4) | xh_1) - 16;
+
+    v0 = x0*d;
+    v1 = x1*d;
+}
+
+static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q5_1 * x = (const block_q5_1 *) vx;
+
+    const float d = x[ib].d;
+    const float m = x[ib].m;
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const uint8_t xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    const int32_t x1 = ((x[ib].qs[iqs] >>  4) | xh_1);
+
+    v0 = x0*d + m;
+    v1 = x1*d + m;
+}
+
+static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q8_0 * x = (const block_q8_0 *) vx;
+
+    const float d = x[ib].d;
+
+    const int8_t vi0 = x[ib].qs[iqs + 0];
+    const int8_t vi1 = x[ib].qs[iqs + 1];
+
+    v0 = vi0*d;
+    v1 = vi1*d;
+}
+
+static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const half * x = (const half *) vx;
+
+    v0 = __half2float(x[ib + 0]);
+    v1 = __half2float(x[ib + 1]);
+}
+
 static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
+    static const int qk = QK4_0;
+
    const block_q4_0 * x = (const block_q4_0 *) vx;

    const int i = blockIdx.x;

    const float d = x[i].d;

-    const uint8_t * pp = x[i].qs;
+    for (int j = 0; j < qk/2; ++j) {
+        const int x0 = (x[i].qs[j] & 0xf) - 8;
+        const int x1 = (x[i].qs[j] >>  4) - 8;

-    for (int l = 0; l < QK4_0; l += 2) {
-        const uint8_t vi = pp[l/2];
-
-        const int8_t vi0 = vi & 0xf;
-        const int8_t vi1 = vi >> 4;
-
-        const float v0 = (vi0 - 8)*d;
-        const float v1 = (vi1 - 8)*d;
-
-        y[i*QK4_0 + l + 0] = v0;
-        y[i*QK4_0 + l + 1] = v1;
+        y[i*qk + j + 0   ] = x0*d;
+        y[i*qk + j + qk/2] = x1*d;
    }
 }

 static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
+    static const int qk = QK4_1;
+
    const block_q4_1 * x = (const block_q4_1 *) vx;

    const int i = blockIdx.x;
@ -111,75 +198,42 @@ static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
    const float d = x[i].d;
    const float m = x[i].m;

-    const uint8_t * pp = x[i].qs;
+    for (int j = 0; j < qk/2; ++j) {
+        const int x0 = (x[i].qs[j] & 0xf);
+        const int x1 = (x[i].qs[j] >>  4);

-    for (int l = 0; l < QK4_1; l += 2) {
-        const uint8_t vi = pp[l/2];
-
-        const int8_t vi0 = vi & 0xf;
-        const int8_t vi1 = vi >> 4;
-
-        const float v0 = vi0*d + m;
-        const float v1 = vi1*d + m;
-
-        y[i*QK4_1 + l + 0] = v0;
-        y[i*QK4_1 + l + 1] = v1;
-    }
-}
-
-static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
-    const block_q4_2 * x = (const block_q4_2 *) vx;
-
-    const int i = blockIdx.x;
-
-    const float d = x[i].d;
-
-    const uint8_t * pp = x[i].qs;
-
-    for (int l = 0; l < QK4_2; l += 2) {
-        const uint8_t vi = pp[l/2];
-
-        const int8_t vi0 = vi & 0xf;
-        const int8_t vi1 = vi >> 4;
-
-        const float v0 = (vi0 - 8)*d;
-        const float v1 = (vi1 - 8)*d;
-
-        y[i*QK4_2 + l + 0] = v0;
-        y[i*QK4_2 + l + 1] = v1;
+        y[i*qk + j + 0   ] = x0*d + m;
+        y[i*qk + j + qk/2] = x1*d + m;
    }
 }

 static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
+    static const int qk = QK5_0;
+
    const block_q5_0 * x = (const block_q5_0 *) vx;

    const int i = blockIdx.x;

    const float d = x[i].d;

-    const uint8_t * pp = x[i].qs;
-
    uint32_t qh;
    memcpy(&qh, x[i].qh, sizeof(qh));

-    for (int l = 0; l < QK5_0; l += 2) {
-        const uint8_t vi = pp[l/2];
+    for (int j = 0; j < qk/2; ++j) {
+        const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
+        const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;

-        const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
-        const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
+        const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
+        const int32_t x1 = ((x[i].qs[j] >>  4) | xh_1) - 16;

-        const int8_t vi0 = ((vi & 0xf) | vh0);
-        const int8_t vi1 = ((vi >>  4) | vh1);
-
-        const float v0 = (vi0 - 16)*d;
-        const float v1 = (vi1 - 16)*d;
-
-        y[i*QK5_0 + l + 0] = v0;
-        y[i*QK5_0 + l + 1] = v1;
+        y[i*qk + j + 0   ] = x0*d;
+        y[i*qk + j + qk/2] = x1*d;
    }
 }

 static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
+    static const int qk = QK5_1;
+
    const block_q5_1 * x = (const block_q5_1 *) vx;

    const int i = blockIdx.x;
@ -187,41 +241,70 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
    const float d = x[i].d;
    const float m = x[i].m;

-    const uint8_t * pp = x[i].qs;
-
    uint32_t qh;
    memcpy(&qh, x[i].qh, sizeof(qh));

-    for (int l = 0; l < QK5_1; l += 2) {
-        const uint8_t vi = pp[l/2];
+    for (int j = 0; j < qk/2; ++j) {
+        const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
+        const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;

-        const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
-        const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
+        const int x0 = (x[i].qs[j] & 0xf) | xh_0;
+        const int x1 = (x[i].qs[j] >>  4) | xh_1;

-        const int8_t vi0 = (vi & 0xf) | vh0;
-        const int8_t vi1 = (vi >>  4) | vh1;
-
-        const float v0 = vi0*d + m;
-        const float v1 = vi1*d + m;
-
-        y[i*QK5_1 + l + 0] = v0;
-        y[i*QK5_1 + l + 1] = v1;
+        y[i*qk + j + 0   ] = x0*d + m;
+        y[i*qk + j + qk/2] = x1*d + m;
    }
 }

 static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
+    static const int qk = QK8_0;
+
    const block_q8_0 * x = (const block_q8_0 *) vx;

    const int i = blockIdx.x;

    const float d = x[i].d;

-    const int8_t * pp = x[i].qs;
+    for (int j = 0; j < qk; ++j) {
+        y[i*qk + j] = x[i].qs[j]*d;
+    }
+}

-    for (int l = 0; l < QK8_0; l++) {
-        const int8_t vi = pp[l];
+template <int block_size, int qk, int qr, dequantize_kernel_t dequantize_kernel>
+static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols) {
+    const int row = blockIdx.x;
+    const int tid = threadIdx.x;

-        y[i*QK8_0 + l] = vi*d;
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    __shared__ float tmp[block_size]; // separate sum for each thread
+    tmp[tid] = 0;
+
+    for (int i = 0; i < ncols/block_size; i += 2) {
+        const int col = i*block_size + 2*tid;
+        const int ib = (row*ncols + col)/qk; // block index
+        const int iqs = (col%qk)/qr; // quant index
+        const int iybs = col - col%qk; // y block start index
+
+        // dequantize
+        float v0, v1;
+        dequantize_kernel(vx, ib, iqs, v0, v1);
+
+        // matrix multiplication
+        tmp[tid] += v0 * y[iybs + iqs + 0];
+        tmp[tid] += v1 * y[iybs + iqs + y_offset];
+    }
+
+    // sum up partial sums and write back result
+    __syncthreads();
+    for (int s=block_size/2; s>0; s>>=1) {
+        if (tid < s) {
+            tmp[tid] += tmp[tid + s];
+        }
+        __syncthreads();
+    }
+    if (tid == 0) {
+        dst[row] = tmp[0];
    }
 }

@ -235,11 +318,6 @@ static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStre
    dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
 }

-static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
-    const int nb = k / QK4_2;
-    dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
-}
-
 static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
    const int nb = k / QK5_0;
    dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
@ -255,6 +333,36 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStre
    dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
 }

+static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_0, QR4_0, dequantize_q4_0>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_1, QR4_1, dequantize_q4_1>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_0, QR5_0, dequantize_q5_0>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_1, QR5_1, dequantize_q5_1>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK8_0, QR8_0, dequantize_q8_0>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
 // TODO: optimize
 static __global__ void convert_fp16_to_fp32(const void * vx, float * y) {
    const half * x = (const half *) vx;
@ -268,14 +376,18 @@ static void convert_fp16_to_fp32_cuda(const void * x, float * y, int k, cudaStre
    convert_fp16_to_fp32<<<k, 1, 0, stream>>>(x, y);
 }

+static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, 32, 1, convert_f16>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
 static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:
            return dequantize_row_q4_0_cuda;
        case GGML_TYPE_Q4_1:
            return dequantize_row_q4_1_cuda;
-        case GGML_TYPE_Q4_2:
-            return dequantize_row_q4_2_cuda;
        case GGML_TYPE_Q5_0:
            return dequantize_row_q5_0_cuda;
        case GGML_TYPE_Q5_1:
@ -289,8 +401,27 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
    }
 }

+static dequantize_mul_mat_vec_cuda_t ggml_get_dequantize_mul_mat_vec_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_mul_mat_vec_q4_0_cuda;
+        case GGML_TYPE_Q4_1:
+            return dequantize_mul_mat_vec_q4_1_cuda;
+        case GGML_TYPE_Q5_0:
+            return dequantize_mul_mat_vec_q5_0_cuda;
+        case GGML_TYPE_Q5_1:
+            return dequantize_mul_mat_vec_q5_1_cuda;
+        case GGML_TYPE_Q8_0:
+            return dequantize_mul_mat_vec_q8_0_cuda;
+        case GGML_TYPE_F16:
+            return convert_mul_mat_vec_f16_cuda;
+        default:
+            return nullptr;
+    }
+}
+
 // buffer pool for cuda
-#define MAX_CUDA_BUFFERS 16
+#define MAX_CUDA_BUFFERS 256

 struct scoped_spin_lock {
    std::atomic_flag& lock;
@ -348,7 +479,7 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
    CUDA_CHECK(cudaFree(ptr));
 }

-#define GGML_CUDA_MAX_STREAMS 8
+#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
 #define GGML_CUDA_MAX_EVENTS 64
 static cublasHandle_t g_cublasH = nullptr;
 static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_STREAMS] = { nullptr };
@ -587,6 +718,7 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
    const int nb2  = dst->nb[2];
    const int nb3  = dst->nb[3];
    const ggml_type type = src0->type;
+    const bool mul_mat_vec = ne11 == 1;

    const float alpha = 1.0f;
    const float beta = 0.0f;
@ -597,12 +729,16 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
    const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);

    size_t x_size, y_size, d_size, q_size;
-    float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
+    float * d_X = nullptr;
+    if (!mul_mat_vec) {
+        d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
+    }
    float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
    float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
    char  * d_Q = (char  *) ggml_cuda_pool_malloc(n_mm * q_sz, &q_size);

    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(type);
+    dequantize_mul_mat_vec_cuda_t dmmv = ggml_get_dequantize_mul_mat_vec_cuda(type);
    GGML_ASSERT(to_fp32_cuda != nullptr);

    for (int64_t i03 = 0; i03 < ne03; i03++) {
@ -612,31 +748,54 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
            cudaStream_t cudaStream2 = g_cudaStreams2[i % GGML_CUDA_MAX_STREAMS];
            cudaEvent_t  cudaEvent = g_cudaEvents[i % GGML_CUDA_MAX_EVENTS];

-            float * c_X = d_X + i * x_ne;
            float * c_Y = d_Y + i * y_ne;
            float * c_D = d_D + i * d_ne;
            char  * c_Q = d_Q + i * q_sz;

-            // copy src0 and convert to fp32 on device
-            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2));
-            to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2);
-            CUDA_CHECK(cudaGetLastError());
-            CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
+            // copy src0 to device if necessary
+            if (src0->backend == GGML_BACKEND_CPU) {
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2));
+            } else if (src0->backend == GGML_BACKEND_CUDA) {
+                c_Q = ((char *) src0->data) + i * q_sz;
+            } else {
+                GGML_ASSERT(false);
+            }
+            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
+                CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));

-            // copy src1 to device
-            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
+                // copy src1 to device
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));

-            // wait for conversion
-            CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
+                // wait for data
+                CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));

-            // compute
-            CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
-            CUBLAS_CHECK(
-                cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
-                        ne01, ne11, ne10,
-                        &alpha, c_X, ne00,
-                                c_Y, ne10,
-                        &beta,  c_D, ne01));
+                // compute
+                dmmv(c_Q, c_Y, c_D, ne00, ne01, cudaStream);
+                CUDA_CHECK(cudaGetLastError());
+
+            } else { // general dequantization kernel + cuBLAS matrix matrix multiplication
+                float * c_X = d_X + i * x_ne;
+
+                // convert src0 to fp32 on device
+                to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2);
+                CUDA_CHECK(cudaGetLastError());
+                CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
+
+                // copy src1 to device
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
+
+                // wait for conversion
+                CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
+
+                // compute
+                CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
+                CUBLAS_CHECK(
+                    cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
+                            ne01, ne11, ne10,
+                            &alpha, c_X, ne00,
+                                    c_Y, ne10,
+                            &beta,  c_D, ne01));
+            }

            // copy dst to host
            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
@ -645,7 +804,9 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
    }

    CUDA_CHECK(cudaDeviceSynchronize());
-    ggml_cuda_pool_free(d_X, x_size);
+    if (!mul_mat_vec) {
+        ggml_cuda_pool_free(d_X, x_size);
+    }
    ggml_cuda_pool_free(d_Y, y_size);
    ggml_cuda_pool_free(d_D, d_size);
    ggml_cuda_pool_free(d_Q, q_size);
@ -661,8 +822,7 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
    if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
        src1->type == GGML_TYPE_F32 &&
        dst->type == GGML_TYPE_F32 &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CUDA)) {
        return true;
    }

@ -714,3 +874,25 @@ size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct
        return 0;
    }
 }
+
+void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
+    const int64_t ne0 = tensor->ne[0];
+    const int64_t ne1 = tensor->ne[1];
+    const int64_t ne2 = tensor->ne[2];
+    const int64_t ne3 = tensor->ne[3];
+
+    const ggml_type type = tensor->type;
+    const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
+
+    size_t q_size;
+    char * d_Q = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);
+
+    cudaStream_t cudaStream2 = g_cudaStreams2[0];
+
+    // copy tensor to device
+    CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, tensor, 0, 0, cudaStream2));
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    tensor->data = d_Q;
+    tensor->backend = GGML_BACKEND_CUDA;
+}
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@ -14,6 +14,8 @@ void   ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
 void * ggml_cuda_host_malloc(size_t size);
 void   ggml_cuda_host_free(void * ptr);

+void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
+
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml-opencl.c
+++ b/ggml-opencl.c
@ -12,129 +12,129 @@
 #define MULTILINE_QUOTE(...) #__VA_ARGS__
 const char * clblast_dequant = MULTILINE_QUOTE(

+typedef uchar uint8_t;
+typedef int int32_t;
+typedef uint uint32_t;
+
+constant uint QK4_0 = 32;
 struct block_q4_0
 {
    float d;
-    uchar qs[16];
+    uint8_t qs[QK4_0 / 2];
 };

-__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = blocks[i].d;
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint index = i*32 + l*2;
-    result[index + 0] = ((vi & 0xf) - 8)*d;
-    result[index + 1] = ((vi >> 4) - 8)*d;
-}
-
+constant uint QK4_1 = 32;
 struct block_q4_1
 {
    float d;
    float m;
-    uchar qs[16];
+    uint8_t qs[QK4_1 / 2];
 };

-__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = blocks[i].d;
-    const float m = blocks[i].m;
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint index = i*32 + l*2;
-    result[index + 0] = (vi & 0xf) * d + m;
-    result[index + 1] = (vi >> 4) * d + m;
-}
-
-struct block_q4_2
+constant uint QK5_0 = 32;
+struct __attribute__ ((packed)) block_q5_0
 {
-    ushort d;
-    uchar qs[8];
+    half d;
+    uint32_t qh;
+    uint8_t qs[QK5_0 / 2];
 };

-__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 16;
-    const uint l = get_local_id(0);
-
-    const float d = vload_half(0, (__global half*) &blocks[i].d);
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint index = i*16 + l*2;
-    result[index + 0] = ((vi & 0xf) - 8)*d;
-    result[index + 1] = ((vi >> 4) - 8)*d;
-}
-
-
-struct block_q5_0
-{
-    float d;
-    uint qh;
-    uchar qs[16];
-};
-
-__kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = blocks[i].d;
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint l2 = l * 2;
-
-    const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
-    const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
-
-    const uint index = i*32 + l2;
-    result[index + 0] = (((vi & 0xf) | vh0) - 16)*d;
-    result[index + 1] = (((vi >>  4) | vh1) - 16)*d;
-}
-
+constant uint QK5_1 = 32;
 struct block_q5_1
 {
-    ushort d;
-    ushort m;
-    uint qh;
-    uchar qs[16];
+    half d;
+    half m;
+    uint32_t qh;
+    uint8_t qs[QK5_1 / 2];
 };

-__kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = vload_half(0, (__global half*) &blocks[i].d);
-    const float m = vload_half(0, (__global half*) &blocks[i].m);
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint l2 = l * 2;
-
-    const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
-    const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
-
-    const uint index = i*32 + l2;
-    result[index + 0] = ((vi & 0xf) | vh0)*d + m;
-    result[index + 1] = ((vi >>  4) | vh1)*d + m;
-}
-
+constant uint QK8_0 = 32;
 struct block_q8_0
 {
    float d;
-    char qs[32];
+    uint8_t qs[QK8_0];
 };

-__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);

-    result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
+__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
+    constant uint qk = QK4_0;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = x[i].d;
+
+    const int x0 = (x[i].qs[j] & 0xf) - 8;
+    const int x1 = (x[i].qs[j] >>  4) - 8;
+
+    y[i*qk + j + 0   ] = x0*d;
+    y[i*qk + j + qk/2] = x1*d;
+}
+
+__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
+    constant uint qk = QK4_1;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = x[i].d;
+    const float m = x[i].m;
+
+    const int x0 = (x[i].qs[j] & 0xf);
+    const int x1 = (x[i].qs[j] >>  4);
+
+    y[i*qk + j + 0   ] = x0*d + m;
+    y[i*qk + j + qk/2] = x1*d + m;
+}
+
+__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
+    constant uint qk = QK5_0;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = vload_half(0, (__global half*) &x[i].d);
+
+    uint32_t qh = x[i].qh;
+
+    const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
+
+    const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
+    const int32_t x1 = ((x[i].qs[j] >>  4) | xh_1) - 16;
+
+    y[i*qk + j + 0   ] = x0*d;
+    y[i*qk + j + qk/2] = x1*d;
+}
+
+__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
+    constant uint qk = QK5_1;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = vload_half(0, (__global half*) &x[i].d);
+    const float m = vload_half(0, (__global half*) &x[i].m);
+
+    uint32_t qh = x[i].qh;
+
+    const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
+
+    const int x0 = (x[i].qs[j] & 0xf) | xh_0;
+    const int x1 = (x[i].qs[j] >>  4) | xh_1;
+
+    y[i*qk + j + 0   ] = x0*d + m;
+    y[i*qk + j + qk/2] = x1*d + m;
+}
+
+__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
+    constant uint qk = QK8_0;
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = x[i].d;
+    y[i*qk + j] = x[i].qs[j]*d;
 }

 );
@ -148,26 +148,12 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f
        }                                                                                       \
    } while (0)

-#define QK5_0 32
-typedef struct {
-    ggml_fp16_t d;         // delta
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2]; // nibbles / quants
-} block_q5_0;
-
-
-typedef struct {
-    float d;                // delta
-    uint32_t qh;          // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2];  // nibbles / quants
-} cl_block_q5_0;
-
 static cl_platform_id platform;
 static cl_device_id device;
 static cl_context context;
 static cl_command_queue queue;
 static cl_program program;
-static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0;
+static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
 static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
 static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;

@ -238,8 +224,6 @@ void ggml_cl_init(void) {
    CL_CHECK(err, "clCreateKernel");
    kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
    CL_CHECK(err, "clCreateKernel");
-    kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
-    CL_CHECK(err, "clCreateKernel");
    kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
    CL_CHECK(err, "clCreateKernel");
    kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
@ -274,7 +258,6 @@ void ggml_cl_sgemm_wrapper(
    cl_kernel kernel;
    size_t global = n * k, local, size_qb;
    bool dequant;
-    cl_block_q5_0* cl_host_b;

    switch (btype) {
    case GGML_TYPE_F32:
@ -292,28 +275,11 @@ void ggml_cl_sgemm_wrapper(
        local = 16;
        size_qb = global * (sizeof(float) * 2 + local) / 32;
        break;
-    case GGML_TYPE_Q4_2:
-        dequant = true;
-        kernel = kernel_q4_2;
-        local = 8;
-        size_qb = global * (sizeof(ggml_fp16_t) + local) / 16;
-        break;
    case GGML_TYPE_Q5_0:
        dequant = true;
        kernel = kernel_q5_0;
        local = 16;
-        // For some reason OpenCL seems to be incapable of working with structs of size 22.
-        // 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU...
-        // TODO Find the reason, fix and remove workaround.
-        const block_q5_0* b = (const block_q5_0*) host_b;
-        cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32);
-        for (size_t i = 0; i < global / 32; i++) {
-            cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d);
-            memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t));
-            memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2);
-        }
-        host_b = (const float*) cl_host_b;
-        size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32;
+        size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
        break;
    case GGML_TYPE_Q5_1:
        dequant = true;
@ -392,7 +358,4 @@ void ggml_cl_sgemm_wrapper(
    clWaitForEvents(1, &ev_c);
    clReleaseEvent(ev_sgemm);
    clReleaseEvent(ev_c);
-    if (btype == GGML_TYPE_Q5_0) {
-        free((void*) cl_host_b);
-    }
 }
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -190,9 +190,12 @@
 #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
 #define GGML_FILE_VERSION 1

+#define GGML_QNT_VERSION        1    // bump this on quantization format changes
+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
 #define GGML_MAX_DIMS          4
 #define GGML_MAX_NODES         4096
-#define GGML_MAX_PARAMS        16
+#define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_OPT           4
 #define GGML_DEFAULT_N_THREADS 4
@ -231,7 +234,7 @@ extern "C" {
        GGML_TYPE_F16  = 1,
        GGML_TYPE_Q4_0 = 2,
        GGML_TYPE_Q4_1 = 3,
-        GGML_TYPE_Q4_2 = 4,
+        // GGML_TYPE_Q4_2 = 4, support has been removed
        // GGML_TYPE_Q4_3 (5) support has been removed
        GGML_TYPE_Q5_0 = 6,
        GGML_TYPE_Q5_1 = 7,
@ -243,6 +246,11 @@ extern "C" {
        GGML_TYPE_COUNT,
    };

+    enum ggml_backend {
+        GGML_BACKEND_CPU = 0,
+        GGML_BACKEND_CUDA = 1,
+    };
+
    // model file types
    enum ggml_ftype {
        GGML_FTYPE_UNKNOWN     = -1,
@ -251,7 +259,6 @@ extern "C" {
        GGML_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
        GGML_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
        GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        GGML_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
        GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
        GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
        GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
@ -263,12 +270,16 @@ extern "C" {

        GGML_OP_DUP,
        GGML_OP_ADD,
+        GGML_OP_ADD1,
+        GGML_OP_ACC,
        GGML_OP_SUB,
        GGML_OP_MUL,
        GGML_OP_DIV,
        GGML_OP_SQR,
        GGML_OP_SQRT,
+        GGML_OP_LOG,
        GGML_OP_SUM,
+        GGML_OP_SUM_ROWS,
        GGML_OP_MEAN,
        GGML_OP_REPEAT,
        GGML_OP_ABS,
@ -278,12 +289,15 @@ extern "C" {
        GGML_OP_RELU,
        GGML_OP_GELU,
        GGML_OP_SILU,
+        GGML_OP_SILU_BACK,
        GGML_OP_NORM, // normalize
        GGML_OP_RMS_NORM,
+        GGML_OP_RMS_NORM_BACK,

        GGML_OP_MUL_MAT,

        GGML_OP_SCALE,
+        GGML_OP_SET,
        GGML_OP_CPY,
        GGML_OP_CONT,
        GGML_OP_RESHAPE,
@ -291,9 +305,13 @@ extern "C" {
        GGML_OP_PERMUTE,
        GGML_OP_TRANSPOSE,
        GGML_OP_GET_ROWS,
+        GGML_OP_GET_ROWS_BACK,
+        GGML_OP_DIAG,
        GGML_OP_DIAG_MASK_INF,
+        GGML_OP_DIAG_MASK_ZERO,
        GGML_OP_SOFT_MAX,
        GGML_OP_ROPE,
+        GGML_OP_ROPE_BACK,
        GGML_OP_ALIBI,
        GGML_OP_CONV_1D_1S,
        GGML_OP_CONV_1D_2S,
@ -322,7 +340,8 @@ extern "C" {

    // n-dimensional tensor
    struct ggml_tensor {
-        enum ggml_type type;
+        enum ggml_type    type;
+        enum ggml_backend backend;

        int     n_dims;
        int64_t ne[GGML_MAX_DIMS]; // number of elements
@ -353,7 +372,7 @@ extern "C" {

        char name[32];

-        char padding[8]; // TODO: remove and add padding to name?
+        char padding[16];
    };

    // computation graph
@ -497,6 +516,29 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    GGML_API struct ggml_tensor * ggml_add1(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_acc(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_acc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
    GGML_API struct ggml_tensor * ggml_sub(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -520,12 +562,24 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    GGML_API struct ggml_tensor * ggml_log(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_log_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    // return scalar
-    // TODO: compute sum along rows
    GGML_API struct ggml_tensor * ggml_sum(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
+    GGML_API struct ggml_tensor * ggml_sum_rows(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    // mean along rows
    GGML_API struct ggml_tensor * ggml_mean(
            struct ggml_context * ctx,
@ -567,6 +621,13 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_silu_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
    // normalize along rows
    // TODO: eps is hardcoded to 1e-5 for now
    GGML_API struct ggml_tensor * ggml_norm(
@ -577,6 +638,13 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_rms_norm_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
    // A: m rows, n columns
    // B: p rows, n columns (i.e. we transpose it internally)
    // result is m columns, p rows
@ -589,12 +657,66 @@ extern "C" {
    // operations on tensors without backpropagation
    //

-    // in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_scale(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_scale_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_2d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                offset);
+
+
    // a -> b, return view(b)
    GGML_API struct ggml_tensor * ggml_cpy(
            struct ggml_context * ctx,
@ -615,6 +737,11 @@ extern "C" {

    // return view(a)
    // TODO: when we start computing gradient, make a copy instead of view
+    GGML_API struct ggml_tensor * ggml_reshape_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0);
+
    GGML_API struct ggml_tensor * ggml_reshape_2d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -630,6 +757,14 @@ extern "C" {
            int64_t               ne1,
            int64_t               ne2);

+    GGML_API struct ggml_tensor * ggml_reshape_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3);
+
    // offset in bytes
    GGML_API struct ggml_tensor * ggml_view_1d(
            struct ggml_context * ctx,
@ -655,6 +790,18 @@ extern "C" {
            size_t                nb2, // slice stride in bytes
            size_t                offset);

+    GGML_API struct ggml_tensor * ggml_view_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3,
+            size_t                nb1, // row   stride in bytes
+            size_t                nb2, // slice stride in bytes
+            size_t                nb3,
+            size_t                offset);
+
    GGML_API struct ggml_tensor * ggml_permute(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -673,20 +820,50 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    GGML_API struct ggml_tensor * ggml_get_rows_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c);
+
+    GGML_API struct ggml_tensor * ggml_diag(
+        struct ggml_context     * ctx,
+        struct ggml_tensor      * a);
+
    // set elements above the diagonal to -INF
-    // in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_diag_mask_inf(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   n_past);

    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
+    // set elements above the diagonal to 0
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
    GGML_API struct ggml_tensor * ggml_soft_max(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-    // rotary position embedding
    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // rotary position embedding
    // if mode & 1 == 1, skip n_past elements
    // if mode & 2 == 1, GPT-NeoX style
    // TODO: avoid creating a new tensor every time
@ -697,6 +874,23 @@ extern "C" {
            int                   n_dims,
            int                   mode);

+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode);
+
+    // rotary position embedding backward, i.e compute dx from dy
+    // a - dy
+    GGML_API struct ggml_tensor * ggml_rope_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode);
+
    // alibi position embedding
    // in-place, returns view(a)
    struct ggml_tensor * ggml_alibi(
@ -741,13 +935,13 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_map_unary_f32(
            struct ggml_context        * ctx,
            struct ggml_tensor         * a,
-            const  ggml_unary_op_f32_t fun);
+                   ggml_unary_op_f32_t   fun);

    GGML_API struct ggml_tensor * ggml_map_binary_f32(
            struct ggml_context         * ctx,
            struct ggml_tensor          * a,
            struct ggml_tensor          * b,
-            const  ggml_binary_op_f32_t fun);
+                   ggml_binary_op_f32_t   fun);

    //
    // automatic differentiation
@ -876,7 +1070,6 @@ extern "C" {

    GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
--- a/whisper.cpp
+++ b/whisper.cpp
@ -291,15 +291,6 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
            { MODEL_LARGE,  1124ull*MB },
        },
    },
-    { GGML_TYPE_Q4_2,
-        {
-            { MODEL_TINY,     26ull*MB },
-            { MODEL_BASE,     50ull*MB },
-            { MODEL_SMALL,   154ull*MB },
-            { MODEL_MEDIUM,  470ull*MB },
-            { MODEL_LARGE,   940ull*MB },
-        },
-    },
    { GGML_TYPE_Q5_0,
        {
            { MODEL_TINY,     30ull*MB },
@ -861,6 +852,10 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
            model.type = e_model::MODEL_LARGE;
        }

+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+
        // for the big tensors, we have the option to store the data in 16-bit floats or quantized
        // in order to save memory and also to speed up the computation
        wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
@ -882,6 +877,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
        fprintf(stderr, "%s: n_text_layer  = %d\n", __func__, hparams.n_text_layer);
        fprintf(stderr, "%s: n_mels        = %d\n", __func__, hparams.n_mels);
        fprintf(stderr, "%s: ftype         = %d\n", __func__, model.hparams.ftype);
+        fprintf(stderr, "%s: qntvr         = %d\n", __func__, qntvr);
        fprintf(stderr, "%s: type          = %d\n", __func__, model.type);

        // print memory requirements
@ -1106,7 +1102,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
            ctx_size += n_text_layer*(             n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
        }

-        ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
+        ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead

        fprintf(stderr, "%s: model ctx     = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
    }
@ -1554,14 +1550,14 @@ static bool whisper_encode_internal(
                            Qcur),
                        Qcur);

-                //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+                //Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

                // note: no bias for Key
                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
                        layer.attn_k_w,
                        cur);

-                //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+                //Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
                        layer.attn_v_w,
@ -1621,12 +1617,12 @@ static bool whisper_encode_internal(
                struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

                struct ggml_tensor * KQ_scaled =
-                    ggml_scale(ctx0,
+                    ggml_scale_inplace(ctx0,
                            KQ,
                            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
                            );

-                struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
+                struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);

                struct ggml_tensor * V =
                    ggml_cpy(ctx0,
@ -1809,7 +1805,7 @@ static bool whisper_encode_internal(
                layer.cross_attn_k_w,
                cur);

-            Kcross = ggml_scale(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
+            Kcross = ggml_scale_inplace(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));

            wstate.use_buf(ctx0, 1);

@ -1956,14 +1952,14 @@ static bool whisper_decode_internal(
                        Qcur),
                    Qcur);

-            Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

            // note: no bias for Key
            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
                    layer.attn_k_w,
                    cur);

-            Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

            // store key and value to memory
            {
@ -2012,14 +2008,14 @@ static bool whisper_decode_internal(
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            //struct ggml_tensor * KQ_scaled =
-            //    ggml_scale(ctx0,
+            //    ggml_scale_inplace(ctx0,
            //            KQ,
            //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
            //            );

-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);

-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);

            struct ggml_tensor * V =
                ggml_view_3d(ctx0, kv_self.v,
@ -2083,7 +2079,7 @@ static bool whisper_decode_internal(
                        Qcur),
                    Qcur);

-            Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

            // Kcross is already scaled
            struct ggml_tensor * Kcross =
@ -2123,15 +2119,15 @@ static bool whisper_decode_internal(
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            //struct ggml_tensor * KQ_scaled =
-            //    ggml_scale(ctx0,
+            //    ggml_scale_inplace(ctx0,
            //            KQ,
            //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
            //            );

            // no masking for cross-attention
-            //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+            //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);

-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ);

            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

@ -4903,7 +4899,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
    // b: N*N*sizeof(float)
    // c: N*N*sizeof(float)
    // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
-    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
+    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);

    // put a bunch of random data in the buffer
    for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
@ -4911,7 +4907,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
    for (int j = 0; j < (int) sizes.size(); j++) {
        int n_q4_0 = 0;
        int n_q4_1 = 0;
-        int n_q4_2 = 0;
        int n_q5_0 = 0;
        int n_q5_1 = 0;
        int n_q8_0 = 0;
@ -4921,7 +4916,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
        // GFLOPS/s
        double s_q4_0 = 0.0;
        double s_q4_1 = 0.0;
-        double s_q4_2 = 0.0;
        double s_q5_0 = 0.0;
        double s_q5_1 = 0.0;
        double s_q8_0 = 0.0;
@ -4930,18 +4924,17 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {

        const size_t N = sizes[j];

-        for (int k = 0; k < 8; ++k) {
+        for (int k = 0; k < 7; ++k) {
            const ggml_type wtype =
                k == 0 ? GGML_TYPE_Q4_0 :
                k == 1 ? GGML_TYPE_Q4_1 :
-                k == 2 ? GGML_TYPE_Q4_2 :
-                k == 3 ? GGML_TYPE_Q5_0 :
-                k == 4 ? GGML_TYPE_Q5_1 :
-                k == 5 ? GGML_TYPE_Q8_0 :
-                k == 6 ? GGML_TYPE_F16  : GGML_TYPE_F32;
+                k == 2 ? GGML_TYPE_Q5_0 :
+                k == 3 ? GGML_TYPE_Q5_1 :
+                k == 4 ? GGML_TYPE_Q8_0 :
+                k == 5 ? GGML_TYPE_F16  : GGML_TYPE_F32;

-            double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q4_2 : k == 3 ? s_q5_0 : k == 4 ? s_q5_1 : k == 5 ? s_q8_0 : k == 6 ? s_fp16 : /*k == 7*/ s_fp32;
-            int    & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q4_2 : k == 3 ? n_q5_0 : k == 4 ? n_q5_1 : k == 5 ? n_q8_0 : k == 6 ? n_fp16 : /*k == 7*/ n_fp32;
+            double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : k == 4 ? s_q8_0 : k == 5 ? s_fp16 : /*k == 6*/ s_fp32;
+            int    & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : k == 4 ? n_q8_0 : k == 5 ? n_fp16 : /*k == 6*/ n_fp32;

            struct ggml_init_params gparams = {
                /*.mem_size   =*/ buf.size(),
@ -4985,9 +4978,9 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
            s = ((2.0*N*N*N*n)/tsum)*1e-9;
        }

-        // Q4_0 | Q4_1 | Q4_2
-        snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs) | Q4_2 %7.1f GFLOPS (%3d runs)\n",
-                N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_q4_2, n_q4_2);
+        // Q4_0 | Q4_1
+        snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)\n",
+                N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1);
        s += strbuf;

        // Q5_0 | Q5_1 | Q8_0