ggml : use macros to inline FP16 <-> FP32 conversions

2025-07-04 00:11:12 +02:00 · 2022-12-06 22:05:33 +02:00
3 changed files with 25 additions and 35 deletions
--- a/README.md
+++ b/README.md
@ -52,6 +52,21 @@ The tensor operators are optimized heavily for Apple silicon CPUs. Depending on
 intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
 the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.
 ## Limitations
 - Inference only
 - No GPU support
 - Very basic greedy sampling scheme - always picks the token with the highest probability.
  This should be similar to the [GreedyDecoder](https://github.com/openai/whisper/blob/main/whisper/decoding.py#L249-L274)
  from the original python implementation, so in order to make a fair comparison between the 2 implementations, make sure
  to run the python code with the following parameters:
  ```
  whisper --best_of None --beam_size None ...
  ```
  In the future, `whisper.cpp` will support more sampling strategies.
 ## Quick start
 First, download one of the Whisper models converted in [ggml format](models). For example:
@ -205,21 +220,6 @@ make large
 | medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
 | large  | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
 ## Limitations
 - Inference only
 - No GPU support
 - Very basic greedy sampling scheme - always picks the token with the highest probability.
  This should be similar to the [GreedyDecoder](https://github.com/openai/whisper/blob/main/whisper/decoding.py#L249-L274)
  from the original python implementation, so in order to make a fair comparison between the 2 implementations, make sure
  to run the python code with the following parameters:
  ```
  whisper --best_of None --beam_size None ...
  ```
  In the future, `whisper.cpp` will support more sampling strategies.
 ## Another example
 Here is another example of transcribing a [3:24 min speech](https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg)
--- a/examples/livestream.sh
+++ b/examples/livestream.sh
@ -4,6 +4,10 @@ set -eo pipefail
 # Idea by @semiformal-net
 # ref: https://github.com/ggerganov/whisper.cpp/issues/185
 #
 # TODO:
 # - Currently, there is a gap between sequential chunks, so some of the words are dropped. Need to figure out a
 #   way to produce a continuous stream of audio chunks.
 #
 url="http://a.files.bbci.co.uk/media/live/manifesto/audio/simulcast/hls/nonuk/sbr_low/ak/bbc_world_service.m3u8"
 fmt=aac # the audio format extension of the stream (TODO: auto detect)
--- a/ggml.c
+++ b/ggml.c
@ -96,8 +96,6 @@ typedef void* thread_ret_t;
 #include <Accelerate/Accelerate.h>
 #elif GGML_USE_OPENBLAS
 #include <cblas.h>
 // sgemm
 extern void sgemm_(char* transa, char* transb, int* m, int* n, int* k, float* alpha, float* a, int* lda, float* b, int* ldb, float* beta, float* c, int* ldc);
 #endif
 // floating point type used to accumulate sums
@ -215,7 +213,7 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
 }
 #define GGML_FP16_TO_FP32(x) ggml_fp16_to_fp32(x)
-#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x)
+#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x)
 #endif // __F16C__
@ -4590,23 +4588,11 @@ void ggml_compute_forward_mul_mat_f16_f32(
                // zT = y * xT
                {
-                    //cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
+                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                    //        ne11, ne01, ne10,
+                            ne11, ne01, ne10,
-                    //        1.0f,    y, ne10,
+                            1.0f,    y, ne10,
-                    //                 x, ne10,
+                                     x, ne10,
-                    //        0.0f,    d, ne01);
+                            0.0f,    d, ne01);
                    // this is compatible with nvblas
                    float one = 1.0f;
                    float zero = 0.0f;
                    sgemm_(
                            "T", "N",
                            &ne0, &ne1, &ne10,
                            &one,
                            x, &ne10,
                            y, &ne10,
                            &zero,
                            d, &ne0);
                }
            }
        }