ggml : use sched_yield when using BLAS + add comment

whisper : fix bench regression
whisper : faster beam_search sampling via reduced KV cache copies (#1243)
2025-07-04 16:30:58 +02:00 · 2023-09-12 13:33:09 +03:00 · 2023-09-12 11:21:02 +03:00 · 2023-09-10 16:04:27 +03:00 · 2023-09-09 18:55:51 +03:00 · 2023-09-08 20:50:15 +03:00
33 changed files with 526 additions and 204 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -428,15 +428,15 @@ jobs:
      - name: Publish package
        if: ${{ github.ref == 'refs/heads/master' }}
-        uses: gradle/gradle-build-action@v2
+        uses: gradle/gradle-build-action@v2.4.2
        with:
          arguments: publish
          build-root-directory: bindings/java
        env:
          MAVEN_USERNAME: ${{ secrets.JIRA_USER }}
          MAVEN_PASSWORD: ${{ secrets.JIRA_PASS }}
-          # MAVEN_USERNAME: ${{ secrets.OSSRH_USERNAME }}
+          PGP_SECRET: ${{ secrets.GPG_PRIVATE_KEY }}
-          # MAVEN_PASSWORD: ${{ secrets.OSSRH_TOKEN }}
+          PGP_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
  quantize:
    runs-on: ubuntu-latest
--- a/.gitignore
+++ b/.gitignore
@ -11,6 +11,7 @@ build/
 build-em/
 build-debug/
 build-release/
 build-rwdi/
 build-static/
 build-cublas/
 build-no-accel/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -321,6 +321,53 @@ else()
    endif()
 endif()
 #
 # POSIX conformance
 #
 # clock_gettime came in POSIX.1b (1993)
 # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
 # posix_memalign came in POSIX.1-2001 / SUSv3
 # M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
 #
 # Somehow in OpenBSD whenever POSIX conformance is specified
 # some string functions rely on locale_t availability,
 # which was introduced in POSIX.1-2008, forcing us to go higher
 #
 # The level is chosen once, instead of adding 600 and then retracting it,
 # so that exactly one _XOPEN_SOURCE definition is ever registered.
 if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
    add_compile_definitions(_XOPEN_SOURCE=700)
 else()
    add_compile_definitions(_XOPEN_SOURCE=600)
 endif()
 # Data types, macros and functions related to controlling CPU affinity
 # are available on Linux through GNU extensions in libc,
 # so request those extensions by defining _GNU_SOURCE for every target
 if (CMAKE_SYSTEM_NAME MATCHES "Linux")
    add_compile_definitions(_GNU_SOURCE)
 endif()
 # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
 # and on macOS its availability depends on enabling Darwin extensions
 if (CMAKE_SYSTEM_NAME MATCHES "Darwin")
    add_compile_definitions(_DARWIN_C_SOURCE)
 endif()
 # similarly on DragonFly, enabling BSD extensions is necessary;
 # this uses the BSD visibility macro (matching the Makefile build),
 # not the Darwin-specific one
 if (CMAKE_SYSTEM_NAME MATCHES "DragonFly")
    add_compile_definitions(__BSD_VISIBLE)
 endif()
 # alloca is not a standard interface: once POSIX conformance is requested
 # the BSDs hide it, and each of them needs its own macro to expose it again
 # (not all of them provide a clean way to do so)
 if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
    add_compile_definitions(__BSD_VISIBLE)
 elseif (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
    add_compile_definitions(_NETBSD_SOURCE)
 elseif (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
    add_compile_definitions(_BSD_SOURCE)
 endif()
 if (WHISPER_PERF)
    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
 endif()
--- a/Makefile
+++ b/Makefile
@ -42,18 +42,55 @@ CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
 CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS  =
-# ref: https://github.com/ggerganov/whisper.cpp/issues/37
+# clock_gettime came in POSIX.1b (1993)
-ifneq ($(wildcard /usr/include/musl/*),)
+# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
-	CFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
+# posix_memalign came in POSIX.1-2001 / SUSv3
-	CXXFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
+# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
 CFLAGS   += -D_XOPEN_SOURCE=600
 CXXFLAGS += -D_XOPEN_SOURCE=600
 # Somehow in OpenBSD whenever POSIX conformance is specified
 # some string functions rely on locale_t availability,
 # which was introduced in POSIX.1-2008, forcing us to go higher
 ifeq ($(UNAME_S),OpenBSD)
 	CFLAGS   += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
 	CXXFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
 endif
 # Data types, macros and functions related to controlling CPU affinity
 # are available on Linux through GNU extensions in libc
 ifeq ($(UNAME_S),Linux)
 	CFLAGS   += -D_GNU_SOURCE
 	CXXFLAGS += -D_GNU_SOURCE
 endif
 # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
 # and on macOS its availability depends on enabling Darwin extensions
 # similarly on DragonFly, enabling BSD extensions is necessary
 ifeq ($(UNAME_S),Darwin)
 	CFLAGS   += -D_DARWIN_C_SOURCE
 	CXXFLAGS += -D_DARWIN_C_SOURCE
 endif
 ifeq ($(UNAME_S),DragonFly)
 	CFLAGS   += -D__BSD_VISIBLE
 	CXXFLAGS += -D__BSD_VISIBLE
 endif
 # alloca is a non-standard interface that is not visible on BSDs when
 # POSIX conformance is specified, but not all of them provide a clean way
 # to enable it in such cases
 ifeq ($(UNAME_S),FreeBSD)
 	CFLAGS   += -D__BSD_VISIBLE
 	CXXFLAGS += -D__BSD_VISIBLE
 endif
 ifeq ($(UNAME_S),NetBSD)
 	CFLAGS   += -D_NETBSD_SOURCE
 	CXXFLAGS += -D_NETBSD_SOURCE
 endif
 ifeq ($(UNAME_S),OpenBSD)
 	CFLAGS   += -D_BSD_SOURCE
 	CXXFLAGS += -D_BSD_SOURCE
 endif
 # OS specific
 # TODO: support Windows
@ -67,7 +104,7 @@ endif
 #       feel free to update the Makefile for your architecture and send a pull request or issue
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	ifeq ($(UNAME_S),Darwin)
-		CPUINFO_CMD := sysctl machdep.cpu.features
+		CPUINFO_CMD := sysctl machdep.cpu.features machdep.cpu.leaf7_features
 	else ifeq ($(UNAME_S),Linux)
 		CPUINFO_CMD := cat /proc/cpuinfo
 	else ifneq (,$(filter MINGW32_NT% MINGW64_NT%,$(UNAME_S)))
--- a/bindings/java/build.gradle
+++ b/bindings/java/build.gradle
@ -2,6 +2,7 @@ plugins {
    id 'java'
    id 'java-library'
    id 'maven-publish'
    id 'signing'
 }
 archivesBaseName = 'whispercpp'
@ -109,4 +110,23 @@ publishing {
            }
        }
    }
    repositories {
        maven {
            def releasesRepoUrl = 'https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/'
            def snapshotsRepoUrl = 'https://s01.oss.sonatype.org/content/repositories/snapshots/'
            url = version.endsWith('-SNAPSHOT') ? snapshotsRepoUrl : releasesRepoUrl
            credentials {
                username = System.getenv("MAVEN_USERNAME")
                password = System.getenv("MAVEN_PASSWORD")
            }
        }
    }
 }
 // Sign the mavenJava publication with an in-memory PGP key.
 // The key and its passphrase are read from the PGP_SECRET and
 // PGP_PASSPHRASE environment variables.
 signing {
    useInMemoryPgpKeys(System.getenv("PGP_SECRET"), System.getenv("PGP_PASSPHRASE"))
    sign publishing.publications.mavenJava
 }
--- a/examples/command/command.cpp
+++ b/examples/command/command.cpp
@ -6,8 +6,8 @@
 // ref: https://github.com/ggerganov/whisper.cpp/issues/171
 //
 #include "common.h"
 #include "common-sdl.h"
 #include "common.h"
 #include "whisper.h"
 #include <sstream>
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -792,7 +792,7 @@ bool sam_params_parse(int argc, char ** argv, sam_params & params) {
    return true;
 }
-void sam_print_usage(int argc, char ** argv, const sam_params & params) {
+void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
--- a/examples/lsp/lsp.cpp
+++ b/examples/lsp/lsp.cpp
@ -324,7 +324,7 @@ json register_commandset(struct whisper_context * ctx, json jparams, std::vector
    commandset_list.push_back(cs);
    return json{{"index",index}};
 }
-json seek(struct whisper_context * ctx, audio_async &audio, json params) {
+json seek(struct whisper_context * /*ctx*/, audio_async & /*audio*/, json /*params*/) {
    // whisper_state has the pertinent offsets, but there also seem to be a large
    // number of scratch buffers that would prevent rewinding context in a manner similar to llama
    // I'll give this another pass once everything else is implemented,
@ -412,7 +412,7 @@ void process_loop(struct whisper_context * ctx, audio_async &audio, const whispe
            jobqueue.pop_front();
            // send response
            std::string data = resp.dump(-1, ' ', false, json::error_handler_t::replace);
-            fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", data.length()+1, data.c_str());
+            fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", (int)data.length()+1, data.c_str());
            std::cout.flush();
        }
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -260,7 +260,7 @@ std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s
    return speaker;
 }
-void whisper_print_progress_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int progress, void * user_data) {
+void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
    int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
    int * progress_prev  = &(((whisper_print_user_data *) user_data)->progress_prev);
    if (progress >= *progress_prev + progress_step) {
@ -492,7 +492,7 @@ bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_
    return true;
 }
-bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
+bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
    std::ofstream fout(fname);
    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@ -3,8 +3,8 @@
 // A very quick-n-dirty implementation serving mainly as a proof of concept.
 //
 #include "common.h"
 #include "common-sdl.h"
 #include "common.h"
 #include "whisper.h"
 #include <cassert>
--- a/examples/talk-llama/llama.cpp
+++ b/examples/talk-llama/llama.cpp
@ -1,11 +1,3 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
 #include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #endif
 #include "llama-util.h"
 #include "llama.h"
@ -1164,7 +1156,7 @@ static bool llama_eval_internal(
    const llama_token * tokens,
            const int   n_tokens,
            const int   n_past,
-            const int   n_threads) {
+                  int   n_threads) {
    // enforce that the first token is BOS
    if (n_past == 0 && tokens[0] != llama_token_bos()) {
@ -1190,6 +1182,8 @@ static bool llama_eval_internal(
    const int n_vocab = hparams.n_vocab;
    const int n_rot   = hparams.n_embd/hparams.n_head;
    const float eps = 5e-6f; // TODO: take from hparams
    auto & mem_per_token = lctx.mem_per_token;
    auto & buf_compute   = lctx.buf_compute;
@ -1204,7 +1198,7 @@ static bool llama_eval_internal(
    // for big prompts, if BLAS is enabled, it is better to use only one thread
    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
    ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    ggml_set_name(embd, "embd");
@ -1221,7 +1215,7 @@ static bool llama_eval_internal(
        // norm
        {
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, eps);
            // cur = cur*attention_norm(broadcasted)
            cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
@ -1329,7 +1323,7 @@ static bool llama_eval_internal(
        {
            // norm
            {
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, eps);
                // cur = cur*ffn_norm(broadcasted)
                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
@ -1367,7 +1361,7 @@ static bool llama_eval_internal(
    // norm
    {
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, eps);
        // inpL = inpL*norm(broadcasted)
        inpL = ggml_mul(ctx0, inpL, model.norm);
@ -1385,7 +1379,7 @@ static bool llama_eval_internal(
    // run the computation
    ggml_build_forward_expand  (&gf, inpL);
-    ggml_graph_compute       (ctx0, &gf);
+    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
 #ifdef GGML_PERF
    // print timing information per ggml operation (for debugging purposes)
@ -2488,8 +2482,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
            }
            struct ggml_cgraph gf = ggml_build_forward(r);
-            gf.n_threads = n_threads;
+            ggml_graph_compute_with_ctx(lora_ctx, &gf, n_threads);
            ggml_graph_compute(lora_ctx, &gf);
            // we won't need these tensors again, reset the context to save memory
            ggml_free(lora_ctx);
@ -2635,7 +2628,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
            ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
            ggml_cgraph gf{};
            gf.n_threads = 1;
            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
            kout3d->data = out;
@ -2655,7 +2647,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-            ggml_graph_compute(cpy_ctx, &gf);
+            ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1);
            ggml_free(cpy_ctx);
        }
@ -2743,7 +2735,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
            ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
            ggml_cgraph gf{};
            gf.n_threads = 1;
            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
            kin3d->data = (void *) inp;
@ -2763,7 +2754,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-            ggml_graph_compute(cpy_ctx, &gf);
+            ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1);
            ggml_free(cpy_ctx);
        }
--- a/examples/talk-llama/talk-llama.cpp
+++ b/examples/talk-llama/talk-llama.cpp
@ -1,8 +1,8 @@
 // Talk with AI
 //
 #include "common.h"
 #include "common-sdl.h"
 #include "common.h"
 #include "whisper.h"
 #include "llama.h"
@ -649,7 +649,10 @@ int main(int argc, char ** argv) {
                }
                text_to_speak = ::replace(text_to_speak, "\"", "");
-                system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+                int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
                if (ret != 0) {
                    fprintf(stderr, "%s: failed to speak\n", __func__);
                }
                audio.clear();
--- a/examples/talk/gpt-2.cpp
+++ b/examples/talk/gpt-2.cpp
@ -191,9 +191,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
    // create the ggml context
    {
        struct ggml_init_params params = {
-            .mem_size   = ctx_size,
+            /*.mem_size   =*/ ctx_size,
-            .mem_buffer = NULL,
+            /*.mem_buffer =*/ NULL,
-            .no_alloc   = false,
+            /*.no_alloc   =*/ false,
        };
        model.ctx = ggml_init(params);
--- a/examples/talk/talk.cpp
+++ b/examples/talk/talk.cpp
@ -1,8 +1,8 @@
 // Talk with AI
 //
 #include "common.h"
 #include "common-sdl.h"
 #include "common.h"
 #include "whisper.h"
 #include "gpt-2.h"
@ -349,7 +349,10 @@ int main(int argc, char ** argv) {
                gpt2_set_prompt(ctx_gpt, prompt_base.c_str());
                text_to_speak = ::replace(text_to_speak, params.person + ": ", "");
-                system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+                int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
                if (ret != 0) {
                    fprintf(stderr, "%s: system() failed!\n", __func__);
                }
                audio.clear();
--- a/examples/whisper.android/.idea/compiler.xml
+++ b/examples/whisper.android/.idea/compiler.xml
@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="CompilerConfiguration">
-    <bytecodeTargetLevel target="11" />
+    <bytecodeTargetLevel target="17" />
  </component>
 </project>
--- a/examples/whisper.android/.idea/gradle.xml
+++ b/examples/whisper.android/.idea/gradle.xml
@ -4,15 +4,15 @@
  <component name="GradleSettings">
    <option name="linkedExternalProjectsSettings">
      <GradleProjectSettings>
        <option name="testRunner" value="GRADLE" />
        <option name="distributionType" value="DEFAULT_WRAPPED" />
        <option name="externalProjectPath" value="$PROJECT_DIR$" />
        <option name="gradleJvm" value="#GRADLE_LOCAL_JAVA_HOME" />
        <option name="modules">
          <set>
            <option value="$PROJECT_DIR$" />
            <option value="$PROJECT_DIR$/app" />
          </set>
        </option>
        <option name="resolveExternalAnnotations" value="false" />
      </GradleProjectSettings>
    </option>
  </component>
--- a/examples/whisper.android/.idea/misc.xml
+++ b/examples/whisper.android/.idea/misc.xml
@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="ExternalStorageConfigurationManager" enabled="true" />
-  <component name="ProjectRootManager" version="2" languageLevel="JDK_11" default="true" project-jdk-name="Android Studio default JDK" project-jdk-type="JavaSDK">
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_17" default="true" project-jdk-name="jbr-17" project-jdk-type="JavaSDK">
    <output url="file://$PROJECT_DIR$/build/classes" />
  </component>
  <component name="ProjectType">
--- a/examples/whisper.android/app/build.gradle
+++ b/examples/whisper.android/app/build.gradle
@ -5,12 +5,12 @@ plugins {
 android {
    namespace 'com.whispercppdemo'
-    compileSdk 33
+    compileSdk 34
    defaultConfig {
        applicationId "com.whispercppdemo"
        minSdk 26
-        targetSdk 32
+        targetSdk 34
        versionCode 1
        versionName "1.0"
@ -31,19 +31,19 @@ android {
        }
    }
    compileOptions {
-        sourceCompatibility JavaVersion.VERSION_1_8
+        sourceCompatibility JavaVersion.VERSION_17
-        targetCompatibility JavaVersion.VERSION_1_8
+        targetCompatibility JavaVersion.VERSION_17
    }
    kotlinOptions {
-        jvmTarget = '1.8'
+        jvmTarget = '17'
    }
    buildFeatures {
        compose true
    }
    composeOptions {
-        kotlinCompilerExtensionVersion '1.3.1'
+        kotlinCompilerExtensionVersion '1.5.0'
    }
-    ndkVersion "25.1.8937393"
+    ndkVersion "25.2.9519653"
    externalNativeBuild {
        cmake {
            path = file("src/main/jni/whisper/CMakeLists.txt")
@ -57,19 +57,19 @@ android {
 }
 dependencies {
-    implementation 'androidx.activity:activity-compose:1.6.1'
+    implementation 'androidx.activity:activity-compose:1.7.2'
-    implementation 'androidx.compose.material:material-icons-core:1.3.1'
+    implementation 'androidx.compose.material:material-icons-core:1.5.0'
-    implementation 'androidx.compose.material3:material3:1.0.1'
+    implementation 'androidx.compose.material3:material3:1.1.1'
-    implementation "androidx.compose.ui:ui:1.3.2"
+    implementation "androidx.compose.ui:ui:1.5.0"
-    implementation "androidx.compose.ui:ui-tooling-preview:1.3.2"
+    implementation "androidx.compose.ui:ui-tooling-preview:1.5.0"
-    implementation 'androidx.lifecycle:lifecycle-viewmodel-compose:2.5.1'
+    implementation 'androidx.lifecycle:lifecycle-viewmodel-compose:2.6.1'
    implementation "com.google.accompanist:accompanist-permissions:0.28.0"
-    implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-core:1.6.4'
+    implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.2'
    testImplementation 'junit:junit:4.13.2'
-    androidTestImplementation 'androidx.test.ext:junit:1.1.4'
+    androidTestImplementation 'androidx.test.ext:junit:1.1.5'
-    androidTestImplementation 'androidx.test.espresso:espresso-core:3.5.0'
+    androidTestImplementation 'androidx.test.espresso:espresso-core:3.5.1'
-    androidTestImplementation "androidx.compose.ui:ui-test-junit4:1.3.2"
+    androidTestImplementation "androidx.compose.ui:ui-test-junit4:1.5.0"
-    debugImplementation "androidx.compose.ui:ui-tooling:1.3.2"
+    debugImplementation "androidx.compose.ui:ui-tooling:1.5.0"
-    debugImplementation "androidx.compose.ui:ui-test-manifest:1.3.2"
+    debugImplementation "androidx.compose.ui:ui-test-manifest:1.5.0"
 }
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreen.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreen.kt
@ -66,7 +66,7 @@ private fun MainScreen(
@Composable
 private fun MessageLog(log: String) {
-    SelectionContainer() {
+    SelectionContainer {
        Text(modifier = Modifier.verticalScroll(rememberScrollState()), text = log)
    }
 }
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreenViewModel.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreenViewModel.kt
@ -47,7 +47,7 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
    }
    private suspend fun printSystemInfo() {
-        printMessage(String.format("System Info: %s\n", WhisperContext.getSystemInfo()));
+        printMessage(String.format("System Info: %s\n", WhisperContext.getSystemInfo()))
    }
    private suspend fun loadData() {
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/theme/Theme.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/theme/Theme.kt
@ -13,7 +13,7 @@ import androidx.compose.runtime.SideEffect
 import androidx.compose.ui.graphics.toArgb
 import androidx.compose.ui.platform.LocalContext
 import androidx.compose.ui.platform.LocalView
-import androidx.core.view.ViewCompat
+import androidx.core.view.WindowCompat
 private val DarkColorScheme = darkColorScheme(
    primary = Purple80,
@ -55,8 +55,9 @@ fun WhisperCppDemoTheme(
    val view = LocalView.current
    if (!view.isInEditMode) {
        SideEffect {
-            (view.context as Activity).window.statusBarColor = colorScheme.primary.toArgb()
+            val window = (view.context as Activity).window
-            ViewCompat.getWindowInsetsController(view)?.isAppearanceLightStatusBars = darkTheme
+            window.statusBarColor = colorScheme.primary.toArgb()
            WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme
        }
    }
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/whisper/LibWhisper.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/whisper/LibWhisper.kt
@ -18,7 +18,9 @@ class WhisperContext private constructor(private var ptr: Long) {
    suspend fun transcribeData(data: FloatArray): String = withContext(scope.coroutineContext) {
        require(ptr != 0L)
-        WhisperLib.fullTranscribe(ptr, data)
+        val numThreads = WhisperCpuConfig.preferredThreadCount
        Log.d(LOG_TAG, "Selecting $numThreads threads")
        WhisperLib.fullTranscribe(ptr, numThreads, data)
        val textCount = WhisperLib.getTextSegmentCount(ptr)
        return@withContext buildString {
            for (i in 0 until textCount) {
@ -126,7 +128,7 @@ private class WhisperLib {
        external fun initContextFromAsset(assetManager: AssetManager, assetPath: String): Long
        external fun initContext(modelPath: String): Long
        external fun freeContext(contextPtr: Long)
-        external fun fullTranscribe(contextPtr: Long, audioData: FloatArray)
+        external fun fullTranscribe(contextPtr: Long, numThreads: Int, audioData: FloatArray)
        external fun getTextSegmentCount(contextPtr: Long): Int
        external fun getTextSegment(contextPtr: Long, index: Int): String
        external fun getSystemInfo(): String
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/whisper/WhisperCpuConfig.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/whisper/WhisperCpuConfig.kt
@ -0,0 +1,73 @@
 package com.whispercppdemo.whisper
 import android.util.Log
 import java.io.BufferedReader
 import java.io.FileReader
 /**
  * Chooses how many threads to use, based on [CpuInfo.getHighPerfCpuCount].
  */
 object WhisperCpuConfig {
    /** The detected high-performance CPU count, but never fewer than 2. */
    val preferredThreadCount: Int
        get() = maxOf(2, CpuInfo.getHighPerfCpuCount())
 }
 /**
  * Estimates the number of high-performance CPU cores from the given lines
  * (read from /proc/cpuinfo by [readCpuInfo]).
  */
 private class CpuInfo(private val lines: List<String>) {
    /** Tries the frequency-based count first; on any exception falls back to the variant-based count. */
    private fun getHighPerfCpuCount(): Int {
        return try {
            getHighPerfCpuCountByFrequencies()
        } catch (e: Exception) {
            Log.d(LOG_TAG, "Couldn't read CPU frequencies", e)
            getHighPerfCpuCountByVariant()
        }
    }
    /** Counts the CPUs whose maximum frequency is above the lowest maximum frequency found. */
    private fun getHighPerfCpuCountByFrequencies(): Int {
        val frequencies = getCpuValues(property = "processor") { index -> getMaxCpuFrequency(index.toInt()) }
        Log.d(LOG_TAG, "Binned cpu frequencies (frequency, count): ${frequencies.binnedValues()}")
        return frequencies.countDroppingMin()
    }
    /** Counts the CPUs whose "CPU variant" value equals the lowest variant found. */
    private fun getHighPerfCpuCountByVariant(): Int {
        val variants = getCpuValues(property = "CPU variant") { raw -> raw.substringAfter("0x").toInt(radix = 16) }
        Log.d(LOG_TAG, "Binned cpu variants (variant, count): ${variants.binnedValues()}")
        return variants.countKeepingMin()
    }
    /** Maps each distinct value to the number of times it occurs. */
    private fun List<Int>.binnedValues(): Map<Int, Int> = groupingBy { value -> value }.eachCount()
    /**
     * For every line starting with [property], feeds the trimmed text after the
     * first ':' to [mapper]; returns the results in ascending order.
     */
    private fun getCpuValues(property: String, mapper: (String) -> Int): List<Int> {
        val values = mutableListOf<Int>()
        for (line in lines) {
            if (line.startsWith(property)) {
                values.add(mapper(line.substringAfter(':').trim()))
            }
        }
        values.sort()
        return values
    }
    /** Number of elements strictly greater than the minimum (throws on an empty list). */
    private fun List<Int>.countDroppingMin(): Int {
        val lowest = min()
        return count { value -> value > lowest }
    }
    /** Number of elements equal to the minimum (throws on an empty list). */
    private fun List<Int>.countKeepingMin(): Int {
        val lowest = min()
        return count { value -> value == lowest }
    }
    companion object {
        private const val LOG_TAG = "WhisperCpuConfig"
        /**
         * Reads /proc/cpuinfo and derives the high-performance CPU count from it;
         * if that throws, falls back to a guess based on the processor count.
         */
        fun getHighPerfCpuCount(): Int {
            return try {
                readCpuInfo().getHighPerfCpuCount()
            } catch (e: Exception) {
                Log.d(LOG_TAG, "Couldn't read CPU info", e)
                // Our best guess -- just return the # of CPUs minus 4.
                maxOf(0, Runtime.getRuntime().availableProcessors() - 4)
            }
        }
        /** Loads every line of /proc/cpuinfo into a [CpuInfo]. */
        private fun readCpuInfo(): CpuInfo {
            val cpuInfoLines = BufferedReader(FileReader("/proc/cpuinfo")).useLines { it.toList() }
            return CpuInfo(cpuInfoLines)
        }
        /** Parses the first line of the cpufreq cpuinfo_max_freq file for the CPU with index [cpuIndex]. */
        private fun getMaxCpuFrequency(cpuIndex: Int): Int {
            val freqFile = "/sys/devices/system/cpu/cpu${cpuIndex}/cpufreq/cpuinfo_max_freq"
            val firstLine = BufferedReader(FileReader(freqFile)).use { reader -> reader.readLine() }
            return firstLine.toInt()
        }
    }
 }
--- a/examples/whisper.android/app/src/main/jni/whisper/jni.c
+++ b/examples/whisper.android/app/src/main/jni/whisper/jni.c
@ -163,16 +163,12 @@ Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_freeContext(
 JNIEXPORT void JNICALL
 Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_fullTranscribe(
-        JNIEnv *env, jobject thiz, jlong context_ptr, jfloatArray audio_data) {
+        JNIEnv *env, jobject thiz, jlong context_ptr, jint num_threads, jfloatArray audio_data) {
    UNUSED(thiz);
    struct whisper_context *context = (struct whisper_context *) context_ptr;
    jfloat *audio_data_arr = (*env)->GetFloatArrayElements(env, audio_data, NULL);
    const jsize audio_data_length = (*env)->GetArrayLength(env, audio_data);
    // Leave 2 processors free (i.e. the high-efficiency cores).
    int max_threads = max(1, min(8, get_nprocs() - 2));
    LOGI("Selecting %d threads", max_threads);
    // The below adapted from the Objective-C iOS sample
    struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    params.print_realtime = true;
@ -181,7 +177,7 @@ Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_fullTranscribe(
    params.print_special = false;
    params.translate = false;
    params.language = "en";
-    params.n_threads = max_threads;
+    params.n_threads = num_threads;
    params.offset_ms = 0;
    params.no_context = true;
    params.single_segment = false;
--- a/examples/whisper.android/app/src/main/res/values/colors.xml
+++ b/examples/whisper.android/app/src/main/res/values/colors.xml
@ -1,10 +0,0 @@
 <?xml version="1.0" encoding="utf-8"?>
 <resources>
    <color name="purple_200">#FFBB86FC</color>
    <color name="purple_500">#FF6200EE</color>
    <color name="purple_700">#FF3700B3</color>
    <color name="teal_200">#FF03DAC5</color>
    <color name="teal_700">#FF018786</color>
    <color name="black">#FF000000</color>
    <color name="white">#FFFFFFFF</color>
 </resources>
--- a/examples/whisper.android/build.gradle
+++ b/examples/whisper.android/build.gradle
@ -1,6 +1,6 @@
 // Top-level build file where you can add configuration options common to all sub-projects/modules.
 plugins {
-    id 'com.android.application' version '7.3.1' apply false
+    id 'com.android.application' version '8.1.1' apply false
-    id 'com.android.library' version '7.3.1' apply false
+    id 'com.android.library' version '8.1.1' apply false
-    id 'org.jetbrains.kotlin.android' version '1.7.10' apply false
+    id 'org.jetbrains.kotlin.android' version '1.9.0' apply false
 }
--- a/examples/whisper.android/gradle/wrapper/gradle-wrapper.properties
+++ b/examples/whisper.android/gradle/wrapper/gradle-wrapper.properties
@ -1,6 +1,6 @@
 #Wed Dec 14 10:37:24 EST 2022
 distributionBase=GRADLE_USER_HOME
-distributionUrl=https\://services.gradle.org/distributions/gradle-7.4-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip
 distributionPath=wrapper/dists
 zipStorePath=wrapper/dists
 zipStoreBase=GRADLE_USER_HOME
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -4086,7 +4086,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
    dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
 }
-static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
                                    const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
    const int col = blockDim.x*blockIdx.x + threadIdx.x;
    const int half_n_dims = ncols/4;
@ -4098,8 +4099,9 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
    const int i = row*ncols + col;
    const float col_theta_scale = powf(theta_scale, col);
    const float p = p0 + p_delta*(row/p_delta_rows);
-    const float theta = p*col_theta_scale;
+    const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
    const float sin_theta = sinf(theta);
    const float cos_theta = cosf(theta);
@ -4109,7 +4111,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
-    const float block_theta = block_p*col_theta_scale;
+    const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
    const float sin_block_theta = sinf(block_theta);
    const float cos_block_theta = cosf(block_theta);
@ -4984,12 +4986,13 @@ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, co
    rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }
-static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
-    GGML_ASSERT(nrows % 4 == 0);
+                              const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
-    const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+    GGML_ASSERT(ncols % 4 == 0);
-    const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
    const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
 }
 static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@ -5723,22 +5726,18 @@ inline void ggml_cuda_op_rope(
    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
    const float theta_scale = powf(freq_base, -2.0f/n_dims);
    const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;
    // compute
    if (is_glm) {
-        const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
+        rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, n_ctx, cudaStream_main);
        const float id_p = min(p, n_ctx - 2.f);
        const float block_p = max(p - (n_ctx - 2.f), 0.f);
        rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
    } else if (is_neox) {
        GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
        rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
    } else {
        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
        rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
    }
@ -6400,10 +6399,7 @@ void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ten
    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
-    const int mode = ((int32_t *) dst->op_params)[2];
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, true);
    const bool is_glm = mode & 4;
    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
 }
 void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -327,7 +327,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
 void * ggml_metal_host_malloc(size_t n) {
    void * data = NULL;
-    const int result = posix_memalign((void **) &data, getpagesize(), n);
+    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
    if (result != 0) {
        metal_printf("%s: error: posix_memalign failed\n", __func__);
        return NULL;
@ -401,7 +401,7 @@ bool ggml_metal_add_buffer(
            }
        }
-        const size_t size_page = getpagesize();
+        const size_t size_page = sysconf(_SC_PAGESIZE);
        size_t size_aligned = size;
        if ((size_aligned % size_page) != 0) {
@ -1141,7 +1141,7 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&freq_base  length:sizeof(float) atIndex:21];
                            [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
                        } break;
                    case GGML_OP_DUP:
                    case GGML_OP_CPY:
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@ -220,14 +220,10 @@ kernel void kernel_norm(
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }
-    //// broadcast
+    const float mean  = sum[0] / ne00;
    //if (tpitg == 0) {
    //    sum[0] /= ne00;
    //}
    //threadgroup_barrier(mem_flags::mem_threadgroup);
    const float mean  = sum[0];
    // recenter and VARIANCE
    threadgroup_barrier(mem_flags::mem_threadgroup);
    device float * y = dst + tgpig*ne00;
    sum[tpitg] = 0.0f;
    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@ -235,12 +231,6 @@ kernel void kernel_norm(
        sum[tpitg] += y[i00] * y[i00];
    }
    //// VARIANCE
    //// parallel sum
    //sum[tpitg] = 0.0f;
    //for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
    //    sum[tpitg] += y[i00] * y[i00];
    //}
    // reduce
    threadgroup_barrier(mem_flags::mem_threadgroup);
    for (uint i = ntg/2; i > 0; i /= 2) {
@ -249,12 +239,7 @@ kernel void kernel_norm(
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }
-    //// broadcast
+    const float variance = sum[0] / ne00;
    //if (tpitg == 0) {
    //    sum[0] /= ne00;
    //}
    //threadgroup_barrier(mem_flags::mem_threadgroup);
    const float variance = sum[0];
    const float scale = 1.0f/sqrt(variance + eps);
    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@ -262,7 +247,6 @@ kernel void kernel_norm(
    }
 }
 kernel void kernel_rms_norm(
        device const  void * src0,
        device       float * dst,
@ -630,7 +614,6 @@ kernel void kernel_mul_mat_f16_f32(
            }
        }
    }
 }
 kernel void kernel_alibi_f32(
@ -699,25 +682,27 @@ kernel void kernel_rope(
        constant       int & mode,
        constant     float & freq_base,
        constant     float & freq_scale,
-        uint3 tpig[[thread_position_in_grid]]) {
+        uint  tiitg[[thread_index_in_threadgroup]],
-    const int64_t i3 = tpig[2];
+        uint3 tptg[[threads_per_threadgroup]],
-    const int64_t i2 = tpig[1];
+        uint3 tgpig[[threadgroup_position_in_grid]]) {
-    const int64_t i1 = tpig[0];
+    const int64_t i3 = tgpig[2];
    const int64_t i2 = tgpig[1];
    const int64_t i1 = tgpig[0];
    const bool is_neox = mode & 2;
    const float theta_scale = pow(freq_base, -2.0f/n_dims);
    const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
-    float theta = freq_scale * (float)p;
+    const float theta_0 = freq_scale * (float)p;
    const float inv_ndims = -1.f/n_dims;
    if (!is_neox) {
-        for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
            const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
            const float cos_theta = cos(theta);
            const float sin_theta = sin(theta);
            theta *= theta_scale;
            device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
            device       float * dst_data  = (device float *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
@ -729,12 +714,12 @@ kernel void kernel_rope(
        }
    } else {
        for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-            for (int64_t ic = 0; ic < n_dims; ic += 2) {
+            for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
                const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
                const float cos_theta = cos(theta);
                const float sin_theta = sin(theta);
                theta *= theta_scale;
                const int64_t i0 = ib*n_dims + ic/2;
                device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
--- a/ggml.c
+++ b/ggml.c
@ -1,4 +1,3 @@
 #define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
 #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
 #include "ggml.h"
@ -107,6 +106,9 @@ typedef void * thread_ret_t;
 #include <sys/stat.h>
 #include <unistd.h>
 #endif
 #ifdef GGML_USE_CPU_HBM
 #include <hbwmalloc.h>
 #endif
 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@ -196,9 +198,15 @@ typedef void * thread_ret_t;
 #define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
 #else
 inline static void * ggml_aligned_malloc(size_t size) {
    if (size == 0) {
        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
        return NULL;
    }
    void * aligned_memory = NULL;
-#ifdef GGML_USE_METAL
+#ifdef GGML_USE_CPU_HBM
-    int result = posix_memalign(&aligned_memory, getpagesize(), size);
+    int result = hbw_posix_memalign(&aligned_memory, 16, size);
 #elif GGML_USE_METAL
    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
 #else
    int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
 #endif
@ -219,8 +227,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
    return aligned_memory;
 }
 #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
 #ifdef GGML_USE_CPU_HBM
 #define GGML_ALIGNED_FREE(ptr)    if(NULL != ptr) hbw_free(ptr)
 #else
 #define GGML_ALIGNED_FREE(ptr)    free(ptr)
 #endif
 #endif
 #define UNUSED GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
@ -4572,6 +4584,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
        return NULL;
    }
    // allow to call ggml_init with 0 size
    if (params.mem_size == 0) {
        params.mem_size = GGML_MEM_ALIGN;
    }
    const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
    *ctx = (struct ggml_context) {
@ -4774,7 +4791,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
    size_t obj_alloc_size = 0;
-    if (view_src == NULL && ctx->no_alloc == false) {
+    if (view_src == NULL && !ctx->no_alloc) {
        if (ctx->scratch.data != NULL) {
            // allocate tensor data in the scratch buffer
            if (ctx->scratch.offs + data_size > ctx->scratch.size) {
@ -5475,7 +5492,7 @@ static struct ggml_tensor * ggml_mul_impl(
    }
    if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
    }
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@ -5518,7 +5535,7 @@ static struct ggml_tensor * ggml_div_impl(
    }
    if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
    }
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@ -17266,10 +17283,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
        } else {
            // wait for other threads to finish
            const int last = node_n;
-            do {
+            while (true) {
-                //sched_yield();
+                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
                //       depending on the workload and the operating system.
                //       since it is not clear what is the best approach, it should potentially become user-configurable
                //       ref: https://github.com/ggerganov/ggml/issues/291
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                sched_yield();
 #endif
                node_n = atomic_load(&state->shared->node_n);
-            } while (node_n == last);
+                if (node_n != last) break;
            };
        }
        // check if we should stop
@ -19962,7 +19987,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
        struct ggml_tensor * data = NULL;
-        if (params.no_alloc == false) {
+        if (!params.no_alloc) {
            data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
            ok = ok && data != NULL;
@ -20003,7 +20028,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
            }
            // point the data member to the appropriate location in the binary blob using the tensor infos
-            if (params.no_alloc == false) {
+            if (!params.no_alloc) {
              //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
                cur->data = (char *) data->data + ctx->infos[i].offset;               // offset from data
            }
--- a/models/download-ggml-model.sh
+++ b/models/download-ggml-model.sh
@ -22,7 +22,28 @@ function get_script_path() {
 models_path="$(get_script_path)"
 # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small.en-tdrz" "small" "medium.en" "medium" "large-v1" "large" )
+models=( 
    "tiny.en"
    "tiny"
    "tiny-q5_1"
    "tiny.en-q5_1"
    "base.en"
    "base"
    "base-q5_1"
    "base.en-q5_1"
    "small.en"
    "small.en-tdrz"
    "small"
    "small-q5_1"
    "small.en-q5_1"
    "medium"
    "medium.en"
    "medium-q5_0"
    "medium.en-q5_0"
    "large-v1"
    "large"
    "large-q5_0"
 )
 # list available models
 function list_models {
--- a/whisper.cpp
+++ b/whisper.cpp
@ -18,6 +18,7 @@
 #include <cstring>
 #include <fstream>
 #include <map>
 #include <set>
 #include <string>
 #include <thread>
 #include <vector>
@ -117,6 +118,21 @@ static void byteswap_tensor(ggml_tensor * tensor) {
 #define WHISPER_USE_SCRATCH
 #define WHISPER_MAX_SCRATCH_BUFFERS 16
 //
 // ggml helpers
 //
 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
    if (plan.work_size > 0) {
        buf.resize(plan.work_size);
        plan.work_data = buf.data();
    }
    ggml_graph_compute(graph, &plan);
 }
 // available whisper models
 enum e_model {
    MODEL_UNKNOWN,
@ -537,6 +553,7 @@ struct whisper_kv_cache {
    struct ggml_context * ctx;
    // buf points to the memory allocated for both ggml_tensor 'k' and 'v' (see kv_cache_init)
    std::vector<uint8_t> buf;
    int n; // number of tokens currently in the cache
@ -602,7 +619,7 @@ struct whisper_sequence {
 // TAGS: WHISPER_DECODER_INIT
 struct whisper_decoder {
-    // each decoders keeps its own KV-cache
+    // each decoder keeps its own KV-cache
    whisper_kv_cache kv_self;
    // the currently generated sequence of tokens
@ -622,6 +639,24 @@ struct whisper_decoder {
    std::vector<whisper_token> tokens_tmp; // used for whisper_decode calls
 };
 // minimal two-member pair used in place of std::pair
 // (introduced because std::pair was reported to be very slow for this use)
 template<typename A, typename B>
 struct whisper_pair {
    A first;
    B second;
    // default constructor: both members are value-initialized
    whisper_pair() : first(), second() {}
    // construct from copies of the two given values
    whisper_pair(const A& lhs, const B& rhs) : first(lhs), second(rhs) {}
 };
 // beam-search helpers
 // pair of byte buffers used to hold a copy of one decoder's KV cache tensor data
 struct kv_buf {
    std::vector<uint8_t> k; // bytes copied from the decoder's kv_self.k->data
    std::vector<uint8_t> v; // bytes copied from the decoder's kv_self.v->data
 };
 struct whisper_state {
    int64_t t_sample_us = 0;
    int64_t t_encode_us = 0;
@ -641,8 +676,12 @@ struct whisper_state {
    whisper_decoder decoders[WHISPER_MAX_DECODERS] = {};
    // buffer for swapping KV caches between decoders during beam-search
    std::vector<kv_buf> kv_swap_bufs;
    // memory buffers used by encode / decode contexts
    std::vector<uint8_t> buf_compute;
    std::vector<uint8_t> buf_work;
    std::vector<uint8_t> buf_scratch[WHISPER_MAX_SCRATCH_BUFFERS];
    int    buf_last = 0;
@ -655,7 +694,7 @@ struct whisper_state {
    std::vector<whisper_token>   prompt_past;
    // work container used to avoid memory allocations
-    std::vector<std::pair<double, whisper_vocab::id>> logits_id;
+    std::vector<whisper_pair<double, whisper_vocab::id>> logits_id;
    mutable std::mt19937 rng; // used for sampling at t > 0.0
@ -1808,7 +1847,7 @@ static bool whisper_encode_internal(
            struct ggml_cgraph gf = {};
            ggml_build_forward_expand(&gf, cur);
-            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+            ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads);
            //ggml_graph_print(&gf);
        }
@ -1893,7 +1932,7 @@ static bool whisper_encode_internal(
            ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcross, v));
        }
-        ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+        ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads);
        //ggml_graph_print(&gf);
    }
@ -2307,7 +2346,7 @@ static bool whisper_decode_internal(
    // run the computation
    {
        ggml_build_forward_expand(&gf, logits);
-        ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+        ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads);
    }
    // extract logits for all N tokens
@ -3975,17 +4014,21 @@ static std::vector<whisper_token_data> whisper_sample_token_topk(
    auto & logits_id = state.logits_id;
-    logits_id.clear();
+    logits_id.resize(n_logits);
    for (int i = 0; i < n_logits; ++i) {
-        logits_id.push_back({ logits[i], i });
+        logits_id[i].first = logits[i];
        logits_id[i].second = i;
    }
    {
        using pair_type = std::remove_reference<decltype(logits_id)>::type::value_type;
        std::partial_sort(
                logits_id.begin(),
                logits_id.begin() + k, logits_id.end(),
-            [](const std::pair<double, whisper_token> & a, const std::pair<double, whisper_token> & b) {
+                [](const pair_type & a, const pair_type & b) {
            return a.first > b.first;
        });
    }
    std::vector<whisper_token_data> result;
    result.reserve(k);
@ -4080,6 +4123,115 @@ static void whisper_sequence_score(
    }
 }
 // Rearrange the KV caches of the first n_decoders decoders in src[] as described by view.
 // Intended result: decoder i ends up with the KV cache contents that decoder view[i] had on entry.
 // Entries with view[i] == i or view[i] < 0 are left unmodified.
 //
 //   view         - per-decoder index of the decoder whose KV cache should be taken over
 //   src          - decoders whose kv_self caches are modified in place
 //   kv_swap_bufs - scratch storage; resized here to n_decoders entries, each sized to that decoder's k/v tensors
 //   n_decoders   - number of decoders to process
 //
 // The function classifies each decoder, backs up the caches that are both read and overwritten,
 // performs the memcpy-based updates, and finally swaps whole kv_self members for mutually exchanging pairs.
 // Always returns true.
 // NOTE(review): copies are sized by the source tensor / swap buffer; presumably all decoders' KV tensors
 //               have the same byte size - confirm
 static bool whisper_kv_swap_fast(
                   std::vector<int> & view,
                    whisper_decoder   src[],
                std::vector<kv_buf> & kv_swap_bufs,
                          const int & n_decoders) {
    WHISPER_PRINT_DEBUG("%s: n_decoders %d\n", __func__, n_decoders);
    // (decoder->buffer->decoder or decoder->buffer + decoder->decoder)
    std::set<int> two_copy; // decoder indices that require two copies to safely modify their KV caches
    // (buffer->decoder or decoder->decoder)
    std::set<int> one_copy; // decoder indices that require one copy to safely modify their KV caches
    // (decoder<->decoder)
    std::set<int> p_swap_set; // decoder indices able to swap KV-cache pointers
    std::vector<whisper_pair<int, int>> p_swap_vec; // the (i, j) pairs to swap, in discovery order
    p_swap_vec.reserve(n_decoders);
    // see https://github.com/ggerganov/whisper.cpp/wiki
    for (int i = 0; i < n_decoders; i++) {
        // zero-copy (no modification)
        if (i == view[i] || view[i] < 0) {
            continue;
        }
        bool is_one_copy = true;
        // since we modify data sequentially, we only consider decoder indices after current index
        for (int j = i + 1; j < n_decoders; j++) {
            if (i == view[j]) {
                // detect symmetric diagram (i wants j's cache and j wants i's cache)
                if (j == view[i]) {
                    p_swap_set.insert(i);
                    p_swap_set.insert(j);
                    p_swap_vec.emplace_back(i, j);
                } else {
                    // a later decoder still needs i's current contents, so they must be backed up first
                    two_copy.insert(i);
                    is_one_copy = false;
                }
                // only the first later decoder that reads from i is examined
                break;
            }
        }
        if (is_one_copy) {
            one_copy.insert(i);
        }
    }
    // size one swap buffer per decoder to match that decoder's k/v tensors
    kv_swap_bufs.resize(n_decoders);
    for (int i = 0; i < n_decoders; i++) {
        kv_swap_bufs[i].k.resize(ggml_nbytes(src[i].kv_self.k));
        kv_swap_bufs[i].v.resize(ggml_nbytes(src[i].kv_self.v));
    }
    for (auto & i : two_copy) {
        // make a copy of KV caches
        WHISPER_PRINT_DEBUG("%s: store KV cache into swap: idx %d\n", __func__, i);
        memcpy(kv_swap_bufs[i].k.data(), src[i].kv_self.k->data, kv_swap_bufs[i].k.size());
        memcpy(kv_swap_bufs[i].v.data(), src[i].kv_self.v->data, kv_swap_bufs[i].v.size());
    }
    // since two-copy decoder KV caches are protected by kv_swap_bufs, modify them first
    for (auto & i : two_copy) {
        // skip the decoder indices that require pointer swapping
        if (p_swap_set.find(i) != p_swap_set.end()) {
            continue;
        }
        if (two_copy.find(view[i]) != two_copy.end()) {
            // modify KV caches of decoder using data from kv_swap_bufs
            WHISPER_PRINT_DEBUG("%s: two-copy decoder using   swap buffers: swap[%d] -> %d\n", __func__, view[i], i);
            memcpy(src[i].kv_self.k->data, kv_swap_bufs[view[i]].k.data(), kv_swap_bufs[view[i]].k.size());
            memcpy(src[i].kv_self.v->data, kv_swap_bufs[view[i]].v.data(), kv_swap_bufs[view[i]].v.size());
        } else {
            // modify KV caches of decoder using data from the corresponding decoder's KV caches directly
            WHISPER_PRINT_DEBUG("%s: two-copy decoder without swap buffers:      %d  -> %d\n", __func__, view[i], i);
            memcpy(src[i].kv_self.k->data, src[view[i]].kv_self.k->data, ggml_nbytes(src[view[i]].kv_self.k));
            memcpy(src[i].kv_self.v->data, src[view[i]].kv_self.v->data, ggml_nbytes(src[view[i]].kv_self.v));
        }
    }
    // then modify one-copy decoder KV caches
    for (auto & i : one_copy) {
        // skip the decoder indices that require pointer swapping
        if (p_swap_set.find(i) != p_swap_set.end()) {
            continue;
        }
        if (two_copy.find(view[i]) != two_copy.end()) {
            // modify KV caches of decoder using data from kv_swap_bufs
            WHISPER_PRINT_DEBUG("%s: one-copy decoder using   swap buffers: swap[%d] -> %d\n", __func__, view[i], i);
            memcpy(src[i].kv_self.k->data, kv_swap_bufs[view[i]].k.data(), kv_swap_bufs[view[i]].k.size());
            memcpy(src[i].kv_self.v->data, kv_swap_bufs[view[i]].v.data(), kv_swap_bufs[view[i]].v.size());
        } else {
            // modify KV caches of decoder using data from the corresponding decoder's KV caches directly
            WHISPER_PRINT_DEBUG("%s: one-copy decoder without swap buffers:      %d  -> %d\n", __func__, view[i], i);
            memcpy(src[i].kv_self.k->data, src[view[i]].kv_self.k->data, ggml_nbytes(src[view[i]].kv_self.k));
            memcpy(src[i].kv_self.v->data, src[view[i]].kv_self.v->data, ggml_nbytes(src[view[i]].kv_self.v));
        }
    }
    // swap the pointers (exchanges the whole kv_self members of each symmetric pair; no byte copies)
    for (auto & i : p_swap_vec) {
        WHISPER_PRINT_DEBUG("%s: swap pointers: %d <-> %d\n", __func__, i.first, i.second);
        std::swap(src[i.first].kv_self, src[i.second].kv_self);
    }
    return true;
 }
 int whisper_full_with_state(
        struct whisper_context * ctx,
          struct whisper_state * state,
@ -4243,14 +4395,6 @@ int whisper_full_with_state(
    std::vector<whisper_token> prompt;
    prompt.reserve(whisper_n_text_ctx(ctx));
    // beam-search helpers
    struct kv_buf {
        std::vector<uint8_t> k;
        std::vector<uint8_t> v;
    };
    std::vector<kv_buf> kv_bufs;
    struct beam_candidate {
        int decoder_idx;
        int seek_delta;
@ -4399,23 +4543,7 @@ int whisper_full_with_state(
            for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
                const int64_t t_start_sample_us = ggml_time_us();
                // store the KV caches of all decoders when doing beam-search
                if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
                    kv_bufs.resize(n_decoders_cur);
                    for (int j = 0; j < n_decoders_cur; ++j) {
                        auto & decoder = state->decoders[j];
                        if (decoder.completed || decoder.failed) {
                            continue;
                        }
                        kv_bufs[j].k.resize(ggml_nbytes(decoder.kv_self.k));
                        kv_bufs[j].v.resize(ggml_nbytes(decoder.kv_self.v));
                        memcpy(kv_bufs[j].k.data(), decoder.kv_self.k->data, kv_bufs[j].k.size());
                        memcpy(kv_bufs[j].v.data(), decoder.kv_self.v->data, kv_bufs[j].v.size());
                    }
                    beam_candidates.clear();
                }
@ -4463,6 +4591,7 @@ int whisper_full_with_state(
                    });
                    uint32_t cur_c = 0;
                    std::vector<int> decoder_idx(n_decoders_cur, -1);
                    for (int j = 0; j < n_decoders_cur; ++j) {
                        auto & decoder = state->decoders[j];
@ -4481,12 +4610,13 @@ int whisper_full_with_state(
                        decoder.seek_delta = cur.seek_delta;
                        decoder.has_ts     = cur.has_ts;
-                        memcpy(decoder.kv_self.k->data, kv_bufs[cur.decoder_idx].k.data(), kv_bufs[cur.decoder_idx].k.size());
+                        decoder_idx[j] = cur.decoder_idx;
                        memcpy(decoder.kv_self.v->data, kv_bufs[cur.decoder_idx].v.data(), kv_bufs[cur.decoder_idx].v.size());
                        WHISPER_PRINT_DEBUG("%s: beam search: decoder %d: from decoder %d: token = %10s, plog = %8.5f, sum_logprobs = %8.5f\n",
                                __func__, j, cur.decoder_idx, ctx->vocab.id_to_token.at(decoder.sequence.tokens.back().id).c_str(), decoder.sequence.tokens.back().plog, decoder.sequence.sum_logprobs_all);
                    }
                    // update KV caches
                    whisper_kv_swap_fast(decoder_idx, state->decoders, state->kv_swap_bufs, n_decoders_cur);
                }
                // update the decoder state
@ -5111,7 +5241,8 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
    // b: N*N*sizeof(float)
    // c: N*N*sizeof(float)
    // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
-    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);
+    std::vector<uint8_t> buf (3llu*N_max*N_max*sizeof(float) + 3*ggml_tensor_overhead());
    std::vector<uint8_t> work(1llu*N_max*N_max*sizeof(float) + 1*ggml_tensor_overhead());
    // put a bunch of random data in the buffer
    for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
@ -5166,12 +5297,12 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
            double tsum = 0.0;
            // heat-up
-            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+            ggml_graph_compute_helper(work, &gf, n_threads);
            for (int i = 0; i < n_max; ++i) {
                const int64_t t0 = ggml_time_us();
-                ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+                ggml_graph_compute_helper(work, &gf, n_threads);
                const int64_t t1 = ggml_time_us();
Author	SHA1	Message	Date
Georgi Gerganov	09a6325de5	ggml : use sched_yield when using BLAS + add comment	2023-09-12 13:33:09 +03:00
Georgi Gerganov	39c4fc59dd	whisper : fix bench regression	2023-09-12 11:21:02 +03:00
bobqianic	9b14418863	whisper : faster beam_search sampling via reduced KV cache copies (#1243 ) * Faster `beam_search` sampling Refine the KV cache update logic for more intelligent and efficient updating. * Faster `whisper_sample_token_topk` * Update whisper.cpp * Update whisper.cpp * Update whisper.cpp * Reduce `memory allocation` * Add `pointer swapping` * Fixed some bugs * Update whisper.cpp * Apply suggestions from code review * Updated the logic for determining `two-copy` * Updated the logic for determining `two-copy` v2 * whisper : add debug logs + coding style --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-09-10 16:04:27 +03:00
Nicholas Albion	6ddc727fac	java : fixed signing of java artifact using gradle (#1267 ) * --stacktrace signMavenJavaPublication * added temporary step "Debug gradle signing" * cd bindings/java * use GPG_PRIVATE_KEY and GPG_PASSPHRASE * use secrets.GPG_PRIVATE_KEY and GPG_PASSPHRASE	2023-09-09 18:55:51 +03:00
Georgi Gerganov	acb5278cc8	ci : try to fix gradle action (#1265 )	2023-09-08 20:50:15 +03:00
Georgi Gerganov	0839209cab	gitignore : update	2023-09-08 19:45:28 +03:00
Georgi Gerganov	b39809668a	sync : ggml (HBM + Metal + style) (#1264 )	2023-09-08 17:58:31 +03:00
Georgi Gerganov	3e9edc6845	ci : upgrade gradle to 2.4.2 (#1263 ) * ci : upgrade gradle to 2.4.2 * cmake : add comment (#1129)	2023-09-08 17:58:14 +03:00
Georgi Gerganov	bfc73f1fa2	sync : ggml (CUDA faster rope)	2023-09-08 15:01:26 +03:00
Georgi Gerganov	f00c9bba33	cmake : noramlize case (#1129 )	2023-09-08 14:50:03 +03:00
Przemysław Pawełczyk	b55b505690	build : do not use _GNU_SOURCE gratuitously (#1129 ) * Do not use _GNU_SOURCE gratuitously. What is needed to build whisper.cpp and examples is availability of stuff defined in The Open Group Base Specifications Issue 6 (https://pubs.opengroup.org/onlinepubs/009695399/) known also as Single Unix Specification v3 (SUSv3) or POSIX.1-2001 + XSI extensions, plus some stuff from BSD that is not specified in POSIX.1. Well, that was true until NUMA support was added recently in ggml, so enable GNU libc extensions for Linux builds to cover that. There is no need to penalize musl libc which simply follows standards. Not having feature test macros in source code gives greater flexibility to those wanting to reuse it in 3rd party app, as they can build it with minimal FTM (_XOPEN_SOURCE=600) or other FTM depending on their needs. It builds without issues in Alpine (musl libc), Ubuntu (glibc), MSYS2. * examples : include SDL headers before other headers Avoid macOS build error when _DARWIN_C_SOURCE is not defined, brought by SDL2 relying on Darwin extension memset_pattern4/8/16 (from string.h). * make : enable BSD extensions for DragonFlyBSD to expose RLIMIT_MEMLOCK * make : use BSD-specific FTMs to enable alloca on BSDs * make : fix OpenBSD build by exposing newer POSIX definitions * cmake : follow recent FTM improvements from Makefile	2023-09-07 12:36:14 +03:00
Georgi Gerganov	2818de21ff	examples : fix build + compile warnings (close #1256 )	2023-09-07 12:33:12 +03:00
Neil Chudleigh	aed5d40607	models : add quantum models to download-ggml-model.sh (#1235 ) * Add quantized models to download-ggml-model.sh * Update names in download-ggml-model script to normalized	2023-09-07 12:16:58 +03:00
Digipom	afa5477d1c	whisper.android : bump gradle plugin and dependencies + a lint pass (#1255 )	2023-09-07 12:15:59 +03:00
Nicholas Albion	01fcd42431	sign jar for Maven Central repo	2023-09-07 11:45:44 +10:00
Digipom	f990610776	whisper.android : address ARM's big.LITTLE arch by checking cpu info (#1254 ) Addresses https://github.com/ggerganov/whisper.cpp/issues/1248	2023-09-06 18:32:30 +03:00
Didzis Gosko	64cb45fd79	make : fix detection of AVX2 on macOS (#1250 )	2023-09-06 18:22:21 +03:00
Przemysław Pawełczyk	ace6c12ec6	ggml : posixify pagesize (#1251 ) * ggml : use sysconf(_SC_PAGESIZE) instead of getpagesize() derived from BSD sed -i 's,getpagesize(),sysconf(_SC_PAGESIZE),g' ggml.c * metal : use sysconf(_SC_PAGESIZE) instead of getpagesize() derived from BSD sed -i 's,getpagesize(),sysconf(_SC_PAGESIZE),g' ggml-metal.m	2023-09-06 18:19:36 +03:00
Nicholas Albion	cac75be05b	configured publishing.repositories	2023-09-06 13:13:36 +10:00