From 8e8daa845171b98f4477284c1997f53b58756a35 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 13 Sep 2023 19:59:16 +0300
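Subject: [PATCH] metal : speed-up KQ multiplication

Drop the ggml_is_contiguous(src0) check from the condition that follows
the "matrix-matrix multiplication kernel" comment in
ggml_metal_graph_compute; src1 is still required to be contiguous.

In the whisper encoder and decoder graphs, wrap Q in ggml_cont() before
passing it as the second operand of the three K*Q ggml_mul_mat calls
(encoder self-attention, decoder self-attention, decoder
cross-attention).
---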
 ggml-metal.m | 3 +--
 whisper.cpp  | 6 +++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index 54c7f21d..89e4bbf9 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -856,8 +856,7 @@ void ggml_metal_graph_compute(
 
                             // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                             // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                            if (ggml_is_contiguous(src0) &&
-                                ggml_is_contiguous(src1) &&
+                            if (ggml_is_contiguous(src1) &&
                                 src1t == GGML_TYPE_F32 &&
                                 [ctx->device supportsFamily:MTLGPUFamilyApple7] &&
                                 ne00%32 == 0 &&
diff --git a/whisper.cpp b/whisper.cpp
index 8f4fd714..dbb1abae 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -1688,7 +1688,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
                         0, 2, 1, 3);
 
             // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, ggml_cont(ctx0, Q));
 
             struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQscale);
 
@@ -2089,7 +2089,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
                         ggml_element_size(kv_self.k)*n_state*n_ctx*il);
 
             // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, ggml_cont(ctx0, Q));
 
             //struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
 
@@ -2184,7 +2184,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
                         0, 2, 1, 3);
 
             // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, Kcross, Q);
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, Kcross, ggml_cont(ctx0, Q));
 
             //struct ggml_tensor * KQ_scaled =
             //    ggml_scale(ctx0,