Merge branch 'master' into metal-and-alloc

2025-08-14 17:08:41 +02:00 · 2023-09-12 14:02:43 +03:00
parent d3b2dd4955 3fec2119e6
commit de94c783ee
2 changed files with 15 additions and 6 deletions
--- a/ggml.c
+++ b/ggml.c
@ -17283,10 +17283,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
        } else {
            // wait for other threads to finish
            const int last = node_n;
-            do {
+            while (true) {
-                //sched_yield();
+                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
                //       depending on the workload and the operating system.
                //       since it is not clear what the best approach is, it should potentially become user-configurable
                //       ref: https://github.com/ggerganov/ggml/issues/291
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                sched_yield();
 #endif
                node_n = atomic_load(&state->shared->node_n);
-            } while (node_n == last);
+                if (node_n != last) break;
            };
        }
        // check if we should stop
--- a/whisper.cpp
+++ b/whisper.cpp
@ -5332,7 +5332,8 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
    // b: N*N*sizeof(float)
    // c: N*N*sizeof(float)
    // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
-    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);
+    std::vector<uint8_t> buf(3llu*N_max*N_max*sizeof(float) + 3*ggml_tensor_overhead());
    std::vector<uint8_t> work;
    // put a bunch of random data in the buffer
    for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
@ -5387,12 +5388,12 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
            double tsum = 0.0;
            // heat-up
-            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+            ggml_graph_compute_helper(work, &gf, n_threads);
            for (int i = 0; i < n_max; ++i) {
                const int64_t t0 = ggml_time_us();
-                ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+                ggml_graph_compute_helper(work, &gf, n_threads);
                const int64_t t1 = ggml_time_us();