Compare commits

...

7 Commits

SHA1 Message Date
0b9af32a8b release : v1.5.4 2024-01-05 17:11:27 +02:00
11b1b63b14 fix : cuda order of synchronization when setting a buffer (ggml/679)
* fix : cuda order of synchronization when setting a buffer

* also sync before memcpy

---------

Co-authored-by: slaren <slarengh@gmail.com>
2024-01-05 17:01:59 +02:00
0e26a6c92e metal : switch back to default.metallib (ggml/681)
ggml-ci
2024-01-05 16:31:30 +02:00
66d8f0b7f1 ggml : fix q2_k bpw in comments (ggml/680) 2024-01-05 16:31:20 +02:00
ba5bcde874 coreml : fix ANE optimized encoder (#1716) 2024-01-04 16:28:30 +02:00
ab0a8593c5 whisper.swiftui : add .gitignore 2024-01-04 15:00:27 +02:00
668ffc9b23 whispser : reset the "batched" timings (#1721) 2024-01-04 13:38:39 +02:00
13 changed files with 17 additions and 27 deletions


@@ -1,6 +1,6 @@
cmake_minimum_required (VERSION 3.5)
project(whisper.cpp VERSION 1.5.3)
project(whisper.cpp VERSION 1.5.4)
# Add path to modules
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")


@@ -6,7 +6,7 @@
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)
Stable: [v1.5.3](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.5.3) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
Stable: [v1.5.4](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.5.4) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:


@@ -1,6 +1,6 @@
{
"name": "whisper.cpp",
"version": "1.5.3",
"version": "1.5.4",
"description": "Whisper speech recognition",
"main": "whisper.js",
"scripts": {


@@ -24,9 +24,9 @@ struct whisper_coreml_context * whisper_coreml_init(const char * path_model) {
// select which device to run the Core ML model on
MLModelConfiguration *config = [[MLModelConfiguration alloc] init];
config.computeUnits = MLComputeUnitsCPUAndGPU;
// config.computeUnits = MLComputeUnitsCPUAndGPU;
//config.computeUnits = MLComputeUnitsCPUAndNeuralEngine;
//config.computeUnits = MLComputeUnitsAll;
config.computeUnits = MLComputeUnitsAll;
const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model configuration:config error:nil]);

examples/whisper.swiftui/.gitignore vendored Normal file

@@ -0,0 +1,2 @@
xcuserdata
xcshareddata


@@ -1 +1 @@
3fd01e00e40583ccd4b393a7c6502d6a4455a1d5
3eace58911ea8d2cf35defdc59848d99b91a57f5


@@ -9689,8 +9689,8 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaDeviceSynchronize());
CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaDeviceSynchronize());
}
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
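The change above arranges the synchronization around the host-to-device copy so that the device is synchronized before the memcpy as well as after it (see the hunk above). Below is a minimal, self-contained sketch of that pattern; the function name set_tensor_data, the simplified CUDA_CHECK macro and the toy main() are illustrative stand-ins rather than the actual ggml code, and only the CUDA runtime calls mirror the diff.

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// simplified error-check macro (illustrative; not ggml's CUDA_CHECK)
#define CUDA_CHECK(call) do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n", cudaGetErrorString(err_), __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)

// copy `size` bytes of host data into a device buffer at `offset`:
// synchronize before the memcpy so pending work on the buffer has finished,
// and after it so the data is in place when the function returns
static void set_tensor_data(void * dev_base, const void * data, size_t offset, size_t size, int device) {
    CUDA_CHECK(cudaSetDevice(device));
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaMemcpy((char *) dev_base + offset, data, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaDeviceSynchronize());
}

int main(void) {
    float host[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    void * dev = NULL;
    CUDA_CHECK(cudaMalloc(&dev, sizeof(host)));
    set_tensor_data(dev, host, 0, sizeof(host), 0);
    CUDA_CHECK(cudaFree(dev));
    return 0;
}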


@@ -258,14 +258,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
#endif
NSError * error = nil;
NSString * libPath = [bundle pathForResource:@"ggml" ofType:@"metallib"];
NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
if (libPath != nil) {
// pre-compiled library found
NSURL * libURL = [NSURL fileURLWithPath:libPath];
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
} else {
GGML_METAL_LOG_INFO("%s: ggml.metallib not found, loading from source\n", __func__);
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
NSString * sourcePath;
NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
@@ -295,7 +295,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
#endif
// try to disable fast-math
// NOTE: this seems to have no effect whatsoever
// instead, in order to disable fast-math, we have to build ggml.metallib from the command line
// instead, in order to disable fast-math, we have to build default.metallib from the command line
// using xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
// and go through the "pre-compiled library found" path above
//[options setFastMathEnabled:false];


@@ -70,7 +70,7 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block s
// 2-bit quantization
// weight is represented as x = a * q + b
// 16 blocks of 16 elements each
// Effectively 2.5625 bits per weight
// Effectively 2.625 bits per weight
typedef struct {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
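For reference, the corrected number follows from the super-block layout: with QK_K = 256 the block above holds QK_K/16 = 16 scale bytes and QK_K/4 = 64 quant bytes, and the full block_q2_K definition also carries two fp16 super-block scales (d and dmin, cut off in this excerpt), giving 84 bytes per 256 weights, i.e. 84 * 8 / 256 = 2.625 bits per weight. A small sketch of that arithmetic, with uint16_t standing in for the fp16 type and the layout assumed to match the full struct:

#include <stdint.h>
#include <stdio.h>

#define QK_K 256

// assumed q2_K super-block layout (fp16 fields modeled as uint16_t)
typedef struct {
    uint8_t  scales[QK_K/16]; // scales and mins, quantized with 4 bits
    uint8_t  qs[QK_K/4];      // 2-bit quants
    uint16_t d;               // super-block scale for quantized scales
    uint16_t dmin;            // super-block scale for quantized mins
} block_q2_K;

int main(void) {
    // 16 + 64 + 2 + 2 = 84 bytes for QK_K = 256 weights
    double bpw = 8.0 * (double) sizeof(block_q2_K) / QK_K;
    printf("q2_K: %zu bytes / %d weights = %.4f bits per weight\n",
           sizeof(block_q2_K), QK_K, bpw); // prints 2.6250
    return 0;
}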


@@ -143,20 +143,7 @@ class AudioEncoderANE(AudioEncoder):
x = block(x)
x = self.ln_post(x)
# """
# TODO:
# I think we need to transpose the result here to make it fit whisper.cpp memory order.
# However, even doing this, the results are still wrong. Kind of less wrong compared to
# not transposing, but still wrong.
# Also, I don't know why the original OpenAI implementation does not need to transpose
# transpose to (batch_size, n_ctx, n_state)
# x : torch.Tensor, shape = (batch_size, n_state, 1, n_ctx)
# """
# x = x.transpose(1,3)
x = x.squeeze(2).transpose(1, 2)
return x


@@ -23,7 +23,7 @@ if [[ $mname == "-h5" ]]; then
echo $mpath
python3 models/convert-h5-to-coreml.py --model-name $mname --model-path $mpath --encoder-only True
else
python3 models/convert-whisper-to-coreml.py --model $mname --encoder-only True
python3 models/convert-whisper-to-coreml.py --model $mname --encoder-only True --optimize-ane True
fi
xcrun coremlc compile models/coreml-encoder-${mname}.mlpackage models/


@@ -3821,6 +3821,7 @@ void whisper_reset_timings(struct whisper_context * ctx) {
ctx->state->t_sample_us = 0;
ctx->state->t_encode_us = 0;
ctx->state->t_decode_us = 0;
ctx->state->t_batchd_us = 0;
ctx->state->t_prompt_us = 0;
ctx->state->n_sample = 0;
ctx->state->n_encode = 0;