Mirror of https://github.com/ggerganov/whisper.cpp.git (synced 2025-07-03 16:01:03 +02:00)

Compare commits: macros-cvt...nvblas (5 commits)

- 683f111088
- 3996ecc156
- faa85f9840
- b6597539f9
- 9a4b7a916e
README.md (30 changed lines)
@@ -52,21 +52,6 @@ The tensor operators are optimized heavily for Apple silicon CPUs. Depending on
 intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
 the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.
 
-## Limitations
-
-- Inference only
-- No GPU support
-- Very basic greedy sampling scheme - always pick the token with highest probability.
-  This should be similar to the [GreedyDecoder](https://github.com/openai/whisper/blob/main/whisper/decoding.py#L249-L274)
-  from the original python implementation, so in order to make a fair comparison between the 2 implementations, make sure
-  to run the python code with the following parameters:
-
-```
-whisper --best_of None --beam_size None ...
-```
-
-In the future, `whisper.cpp` will support more sampling strategies.
-
 ## Quick start
 
 First, download one of the Whisper models converted in [ggml format](models). For example:
@@ -220,6 +205,21 @@ make large
 | medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
 | large | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
 
+## Limitations
+
+- Inference only
+- No GPU support
+- Very basic greedy sampling scheme - always pick the token with highest probability.
+  This should be similar to the [GreedyDecoder](https://github.com/openai/whisper/blob/main/whisper/decoding.py#L249-L274)
+  from the original python implementation, so in order to make a fair comparison between the 2 implementations, make sure
+  to run the python code with the following parameters:
+
+```
+whisper --best_of None --beam_size None ...
+```
+
+In the future, `whisper.cpp` will support more sampling strategies.
+
 ## Another example
 
 Here is another example of transcribing a [3:24 min speech](https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg)
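The "very basic greedy sampling scheme" described in the moved section is simply an argmax over the next-token probabilities at each decoding step. A minimal sketch of the idea, using hypothetical names rather than whisper.cpp's actual API:

```c
// Greedy sampling: always emit the highest-probability token.
// Hypothetical helper for illustration; not whisper.cpp's actual API.
#include <stddef.h>

size_t sample_greedy(const float * probs, size_t n_vocab) {
    size_t best = 0;
    for (size_t i = 1; i < n_vocab; i++) {
        if (probs[i] > probs[best]) {
            best = i;
        }
    }
    return best; // token id with the highest probability
}
```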
livestream.sh (4 changed lines)

@@ -4,10 +4,6 @@ set -eo pipefail
 # Idea by @semiformal-net
 # ref: https://github.com/ggerganov/whisper.cpp/issues/185
 #
-# TODO:
-# - Currently, there is a gap between sequential chunks, so some of the words are dropped. Need to figure out a
-#   way to produce a continuous stream of audio chunks.
-#
 
 url="http://a.files.bbci.co.uk/media/live/manifesto/audio/simulcast/hls/nonuk/sbr_low/ak/bbc_world_service.m3u8"
 fmt=aac # the audio format extension of the stream (TODO: auto detect)
ggml.c (26 changed lines)
@@ -96,6 +96,8 @@ typedef void* thread_ret_t;
 #include <Accelerate/Accelerate.h>
 #elif GGML_USE_OPENBLAS
 #include <cblas.h>
+// sgemm
+extern void sgemm_(char* transa, char* transb, int* m, int* n, int* k, float* alpha, float* a, int* lda, float* b, int* ldb, float* beta, float* c, int* ldc);
 #endif
 
 // floating point type used to accumulate sums
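The prototype added in this hunk targets the Fortran BLAS binding rather than CBLAS: the symbol carries a trailing underscore, every argument (scalars included) is passed by pointer, and matrices are column-major. A minimal direct call, assuming any BLAS that exports `sgemm_` is linked (reference BLAS, OpenBLAS, or NVBLAS):

```c
// Column-major C = A * B through the Fortran-convention entry point.
// Sketch; build with e.g. `gcc sgemm_demo.c -lopenblas`.
#include <stdio.h>

extern void sgemm_(char* transa, char* transb, int* m, int* n, int* k,
                   float* alpha, float* a, int* lda, float* b, int* ldb,
                   float* beta, float* c, int* ldc);

int main(void) {
    int n = 2;
    float one = 1.0f, zero = 0.0f;
    float A[4] = {1, 2, 3, 4}; // column-major: [[1, 3], [2, 4]]
    float I[4] = {1, 0, 0, 1}; // 2x2 identity
    float C[4] = {0};

    // C = 1.0 * A * I + 0.0 * C; note that every argument is a pointer
    sgemm_("N", "N", &n, &n, &n, &one, A, &n, I, &n, &zero, C, &n);

    printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]); // prints: 1 2 3 4
    return 0;
}
```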
@@ -213,7 +215,7 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
 }
 
 #define GGML_FP16_TO_FP32(x) ggml_fp16_to_fp32(x)
-#define GGML_FP32_TO_TP16(x) ggml_fp32_to_fp16(x)
+#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x)
 
 #endif // __F16C__
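This hunk is a one-character typo fix: the fallback macro was defined as `GGML_FP32_TO_TP16`, so any code referencing `GGML_FP32_TO_FP16` in a non-F16C build would fail to compile. For context, the pair is meant to round-trip between fp32 and fp16; a sketch using the x86 F16C intrinsics that ggml appears to use when `__F16C__` is defined (an assumption about the surrounding code; build with `-mf16c`):

```c
// fp32 -> fp16 -> fp32 round trip via the F16C intrinsics.
// Sketch assuming an x86-64 CPU with F16C: gcc -mf16c fp16_demo.c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    float src = 3.14159265f;
    uint16_t h = _cvtss_sh(src, 0); // fp32 -> fp16 bits, round to nearest even
    float back = _cvtsh_ss(h);      // fp16 bits -> fp32
    printf("src = %.8f, back = %.8f\n", src, back); // back ~ 3.140625
    return 0;
}
```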
@@ -4588,11 +4590,23 @@ void ggml_compute_forward_mul_mat_f16_f32(
 
         // zT = y * xT
         {
-            cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                    ne11, ne01, ne10,
-                    1.0f, y, ne10,
-                    x, ne10,
-                    0.0f, d, ne01);
+            //cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
+            //        ne11, ne01, ne10,
+            //        1.0f, y, ne10,
+            //        x, ne10,
+            //        0.0f, d, ne01);
+
+            // this is compatible with nvblas
+            float one = 1.0f;
+            float zero = 0.0f;
+            sgemm_(
+                    "T", "N",
+                    &ne0, &ne1, &ne10,
+                    &one,
+                    x, &ne10,
+                    y, &ne10,
+                    &zero,
+                    d, &ne0);
         }
     }
 }
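This last hunk is the point of the branch: NVBLAS intercepts the Fortran-level BLAS symbols such as `sgemm_` and offloads sufficiently large calls to the GPU, so routing the product through the Fortran entry point makes the OpenBLAS build path NVBLAS-compatible. The layout change works because a row-major matrix reinterpreted as column-major is its transpose: the old call computed the row-major product d = y·xᵀ, and the new one writes the transpose of that product in column-major form, which is byte-identical in memory. A sanity-check sketch, assuming any BLAS exporting `sgemm_` (e.g. `gcc check_sgemm.c -lopenblas -lm`):

```c
// Check that the Fortran-convention call used above matches the
// row-major product d = y * x^T that the old cblas_sgemm call computed.
#include <math.h>
#include <stdio.h>

extern void sgemm_(char* transa, char* transb, int* m, int* n, int* k,
                   float* alpha, float* a, int* lda, float* b, int* ldb,
                   float* beta, float* c, int* ldc);

int main(void) {
    int ne01 = 3, ne11 = 2, ne10 = 4; // rows of x, rows of y, shared dim
    float x[3*4], y[2*4], d[2*3], ref[2*3];

    for (int i = 0; i < 12; i++) x[i] = (float) (i + 1);
    for (int i = 0; i <  8; i++) y[i] = (float) (i % 5) - 2.0f;

    // reference: d[i][j] = sum_k y[i][k] * x[j][k] (row-major, d is ne11 x ne01)
    for (int i = 0; i < ne11; i++) {
        for (int j = 0; j < ne01; j++) {
            float s = 0.0f;
            for (int k = 0; k < ne10; k++) s += y[i*ne10 + k] * x[j*ne10 + k];
            ref[i*ne01 + j] = s;
        }
    }

    // Fortran sgemm is column-major. The column-major view of a row-major
    // buffer is its transpose, so with transa='T' this computes
    // C(ne01 x ne11) = x * y^T in row-major terms, i.e. d^T, and a
    // column-major d^T is byte-identical to the row-major d = y * x^T.
    float one = 1.0f, zero = 0.0f;
    sgemm_("T", "N", &ne01, &ne11, &ne10, &one, x, &ne10, y, &ne10, &zero, d, &ne01);

    float max_err = 0.0f;
    for (int i = 0; i < ne11*ne01; i++) max_err = fmaxf(max_err, fabsf(d[i] - ref[i]));
    printf("max |d - ref| = %g\n", max_err); // expect 0 for these small integers
    return max_err < 1e-5f ? 0 : 1;
}
```

To actually offload, NVIDIA's documented route is to preload NVBLAS over the CPU library, e.g. `LD_PRELOAD=libnvblas.so ./main ...`, with an `nvblas.conf` whose `NVBLAS_CPU_BLAS_LIB` points at the fallback BLAS.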