From 8e8daa845171b98f4477284c1997f53b58756a35 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 13 Sep 2023 19:59:16 +0300 Subject: [PATCH] metal : speed-up KQ multiplication --- ggml-metal.m | 3 +-- whisper.cpp | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 54c7f21d..89e4bbf9 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -856,8 +856,7 @@ void ggml_metal_graph_compute( // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel - if (ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && + if (ggml_is_contiguous(src1) && src1t == GGML_TYPE_F32 && [ctx->device supportsFamily:MTLGPUFamilyApple7] && ne00%32 == 0 && diff --git a/whisper.cpp b/whisper.cpp index 8f4fd714..dbb1abae 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -1688,7 +1688,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder( 0, 2, 1, 3); // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, ggml_cont(ctx0, Q)); struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQscale); @@ -2089,7 +2089,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder( ggml_element_size(kv_self.k)*n_state*n_ctx*il); // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, ggml_cont(ctx0, Q)); //struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); @@ -2184,7 +2184,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder( 0, 2, 1, 3); // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, Kcross, Q); + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, Kcross, ggml_cont(ctx0, Q)); //struct ggml_tensor * KQ_scaled = // ggml_scale(ctx0,