Compare commits

...

5 Commits

3 changed files with 88 additions and 67 deletions


@@ -52,21 +52,6 @@ The tensor operators are optimized heavily for Apple silicon CPUs. Depending on
intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.
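> Reviewer note: for reference, here is a minimal, self-contained sketch (not taken from this repository) of what the Accelerate path boils down to: a single-precision GEMM call through the CBLAS interface that the framework exposes.
```
// Hypothetical demo, not part of whisper.cpp. Build on macOS with:
//   clang sgemm_demo.c -framework Accelerate
#include <Accelerate/Accelerate.h>
#include <stdio.h>

int main(void) {
    // C (2x2) = A (2x3) * B (3x2), all matrices row-major
    float A[6] = { 1, 2, 3, 4, 5, 6 };
    float B[6] = { 7, 8, 9, 10, 11, 12 };
    float C[4] = { 0 };

    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                2, 2, 3,      // M, N, K
                1.0f, A, 3,   // alpha, A, lda
                B, 2,         // B, ldb
                0.0f, C, 2);  // beta, C, ldc

    printf("%.0f %.0f\n%.0f %.0f\n", C[0], C[1], C[2], C[3]); // 58 64 / 139 154
    return 0;
}
```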
## Limitations
- Inference only
- No GPU support
- Very basic greedy sampling scheme - always pick the token with the highest probability.
This should be similar to the [GreedyDecoder](https://github.com/openai/whisper/blob/main/whisper/decoding.py#L249-L274)
from the original Python implementation, so in order to make a fair comparison between the two implementations, make sure
to run the Python code with the following parameters:
```
whisper --best_of None --beam_size None ...
```
In the future, `whisper.cpp` will support more sampling strategies.
## Quick start
First, download one of the Whisper models converted in [ggml format](models). For example:
@@ -220,6 +205,21 @@ make large
| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| large | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
## Limitations
- Inference only
- No GPU support
- Very basic greedy sampling scheme - always pick the token with the highest probability (see the sketch below).
This should be similar to the [GreedyDecoder](https://github.com/openai/whisper/blob/main/whisper/decoding.py#L249-L274)
from the original Python implementation, so in order to make a fair comparison between the two implementations, make sure
to run the Python code with the following parameters:
```
whisper --best_of None --beam_size None ...
```
In the future, `whisper.cpp` will support more sampling strategies.
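> Reviewer note: for intuition, the greedy strategy described above reduces to a single argmax over the vocabulary at each decoding step. A hypothetical sketch follows; the identifiers are illustrative, not the actual whisper.cpp API.
```
// Hypothetical sketch of greedy sampling; `logits` and `n_vocab` are
// illustrative names, not the real whisper.cpp interface.
static int sample_greedy(const float * logits, int n_vocab) {
    int best = 0;
    for (int i = 1; i < n_vocab; ++i) {
        if (logits[i] > logits[best]) {
            best = i; // keep the index of the highest-scoring token
        }
    }
    return best;
}
```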
## Another example
Here is another example of transcribing a [3:24 min speech](https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg)


@@ -4,10 +4,6 @@ set -eo pipefail
# Idea by @semiformal-net
# ref: https://github.com/ggerganov/whisper.cpp/issues/185
#
# TODO:
# - Currently, there is a gap between sequential chunks, so some of the words are dropped. Need to figure out a
# way to produce a continuous stream of audio chunks.
#
url="http://a.files.bbci.co.uk/media/live/manifesto/audio/simulcast/hls/nonuk/sbr_low/ak/bbc_world_service.m3u8"
fmt=aac # the audio format extension of the stream (TODO: auto detect)

ggml.c

@@ -96,6 +96,8 @@ typedef void* thread_ret_t;
#include <Accelerate/Accelerate.h>
#elif GGML_USE_OPENBLAS
#include <cblas.h>
// prototype for the Fortran-interface BLAS sgemm_ routine (column-major), used below
extern void sgemm_(char* transa, char* transb, int* m, int* n, int* k, float* alpha, float* a, int* lda, float* b, int* ldb, float* beta, float* c, int* ldc);
#endif
// floating point type used to accumulate sums
@@ -120,6 +122,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
return x;
}
#define GGML_FP16_TO_FP32(x) (x)
#define GGML_FP32_TO_FP16(x) (x)
#else
#ifdef __wasm_simd128__
@@ -139,6 +144,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
return _cvtss_sh(f, 0);
}
#define GGML_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#else
static inline float fp32_from_bits(uint32_t w) {
@@ -205,8 +213,13 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
const uint32_t nonsign = exp_bits + mantissa_bits;
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
#endif
#endif
#define GGML_FP16_TO_FP32(x) ggml_fp16_to_fp32(x)
#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x)
#endif // __F16C__
#endif // __ARM_NEON
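> Reviewer note: the GGML_FP16_TO_FP32 / GGML_FP32_TO_FP16 macros introduced in these hunks let each platform branch plug in its cheapest conversion (a no-op where ggml_fp16_t is a native half type, a single F16C instruction on x86, the bit-twiddling routine otherwise), avoiding a function call wherever the target has native support. A reduced sketch of the dispatch pattern, with illustrative names:
```
// Simplified illustration of the pattern, not the actual ggml code.
#if defined(__ARM_NEON)
typedef __fp16 my_fp16_t;
#define MY_FP16_TO_FP32(x) (x)              // native type converts implicitly
#elif defined(__F16C__)
#include <immintrin.h>
typedef unsigned short my_fp16_t;
#define MY_FP16_TO_FP32(x) _cvtsh_ss(x)     // one hardware instruction
#else
typedef unsigned short my_fp16_t;
float my_fp16_to_fp32(my_fp16_t x);         // software fallback, defined elsewhere
#define MY_FP16_TO_FP32(x) my_fp16_to_fp32(x)
#endif
```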
//
// global data
@@ -589,7 +602,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
// leftovers
for (int i = n32; i < n; ++i) {
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#elif defined(__AVX2__)
// AVX 256-bit
@@ -633,7 +646,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
// leftovers
for (int i = n32; i < n; ++i) {
//GGML_ASSERT(false);
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#elif defined(__AVX__)
// AVX 256-bit
@@ -677,7 +690,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
// leftovers
for (int i = n32; i < n; ++i) {
//GGML_ASSERT(false);
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#elif defined(__wasm_simd128__)
// WASM 128-bit
@@ -696,8 +709,8 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
for (int i = 0; i < n16; i += 16) {
for (int k = 0; k < 16; ++k) {
tx[k] = ggml_fp16_to_fp32(x[i + k]);
ty[k] = ggml_fp16_to_fp32(y[i + k]);
tx[k] = GGML_FP16_TO_FP32(x[i + k]);
ty[k] = GGML_FP16_TO_FP32(y[i + k]);
}
x0 = wasm_v128_load(tx + 0);
@@ -725,11 +738,11 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
// leftovers
for (int i = n16; i < n; ++i) {
//GGML_ASSERT(false);
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#else
for (int i = 0; i < n; ++i) {
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#endif
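> Reviewer note: all of the branches above share one shape: the bulk of the vector is processed in fixed-size blocks (32 or 16 elements of SIMD work per iteration) and the remainder is finished element by element. A scalar sketch of that structure, under the assumption that only the loop shape matters here:
```
// Scalar sketch of the "blocks + leftovers" pattern used by ggml_vec_dot_f16.
float vec_dot_sketch(const int n, const float * x, const float * y) {
    const int n32 = n & ~31; // largest multiple of 32 that fits in n
    float sum = 0.0f;

    for (int i = 0; i < n32; i += 32) {
        for (int k = 0; k < 32; ++k) { // the real code does this with SIMD FMAs
            sum += x[i + k] * y[i + k];
        }
    }
    for (int i = n32; i < n; ++i) { // leftovers
        sum += x[i] * y[i];
    }
    return sum;
}
```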
@@ -966,7 +979,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
// leftovers
for (int i = n32; i < n; ++i) {
GGML_ASSERT(false);
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#elif defined(__AVX2__)
// AVX 256-bit
@@ -1002,7 +1015,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
// leftovers
for (int i = n32; i < n; ++i) {
GGML_ASSERT(false);
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#elif defined(__AVX__)
// AVX 256-bit
@@ -1038,7 +1051,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
// leftovers
for (int i = n32; i < n; ++i) {
GGML_ASSERT(false);
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#elif defined(__wasm_simd128__)
// WASM SIMD 128-bit
@@ -1054,8 +1067,8 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
for (int i = 0; i < n16; i += 16) {
for (int k = 0; k < 16; ++k) {
tx[k] = ggml_fp16_to_fp32(x[i + k]);
ty[k] = ggml_fp16_to_fp32(y[i + k]);
tx[k] = GGML_FP16_TO_FP32(x[i + k]);
ty[k] = GGML_FP16_TO_FP32(y[i + k]);
}
x0 = wasm_v128_load(tx + 0);
@@ -1079,18 +1092,18 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
wasm_v128_store(ty + 12, y3);
for (int k = 0; k < 16; ++k) {
y[i + k] = ggml_fp32_to_fp16(ty[k]);
y[i + k] = GGML_FP32_TO_FP16(ty[k]);
}
}
// leftovers
for (int i = n16; i < n; ++i) {
GGML_ASSERT(false);
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#else
for (int i = 0; i < n; ++i) {
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#endif
}
@@ -1122,9 +1135,9 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
uint16_t t;
for (int i = 0; i < n; ++i) {
ggml_fp16_t fp16 = ggml_fp32_to_fp16(x[i]);
ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
memcpy(&t, &fp16, sizeof(uint16_t));
y[i] = ggml_fp16_to_fp32(table_gelu_f16[t]);
y[i] = GGML_FP16_TO_FP32(table_gelu_f16[t]);
}
}
#else
@@ -1472,9 +1485,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
for (int i = 0; i < (1 << 16); ++i) {
uint16_t ui = i;
memcpy(&ii, &ui, sizeof(ii));
const float f = ggml_fp16_to_fp32(ii);
table_gelu_f16[i] = ggml_fp32_to_fp16(ggml_gelu_f32(f));
table_exp_f16[i] = ggml_fp32_to_fp16(exp(f));
const float f = GGML_FP16_TO_FP32(ii);
table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f));
}
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
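> Reviewer note: the initialization loop above relies on ggml_fp16_t having only 2^16 distinct bit patterns, so any unary float function (GELU, exp) can be tabulated once at init and later evaluated with one conversion and one load, as in ggml_vec_gelu_f32 earlier in the diff. A self-contained sketch of the idea, with illustrative names standing in for ggml's conversions:
```
#include <math.h>
#include <stdint.h>

// Stand-ins for ggml's fp16 conversions (illustrative declarations).
extern float    fp16_to_fp32(uint16_t h);
extern uint16_t fp32_to_fp16(float f);

static uint16_t table_exp[1 << 16]; // one entry per fp16 bit pattern

void init_exp_table(void) {
    for (uint32_t i = 0; i < (1 << 16); ++i) {
        const float f = fp16_to_fp32((uint16_t) i);
        table_exp[i] = fp32_to_fp16(expf(f));
    }
}

// expf(x) at fp16 precision: convert the argument, index, convert back.
float exp_via_table(float x) {
    return fp16_to_fp32(table_exp[fp32_to_fp16(x)]);
}
```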
@@ -1857,7 +1870,7 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
return ggml_fp16_to_fp32(((ggml_fp16_t *)(tensor->data))[i]);
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
} break;
case GGML_TYPE_F32:
{
@@ -1893,7 +1906,7 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
((ggml_fp16_t *)(tensor->data))[i] = ggml_fp32_to_fp16(value);
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
} break;
case GGML_TYPE_F32:
{
@@ -1927,7 +1940,7 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
return ggml_fp16_to_fp32(((ggml_fp16_t *)(tensor->data))[i]);
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
} break;
case GGML_TYPE_F32:
{
@@ -1963,7 +1976,7 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
((ggml_fp16_t *)(tensor->data))[i] = ggml_fp32_to_fp16(value);
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
} break;
case GGML_TYPE_F32:
{
@@ -3227,7 +3240,7 @@ void ggml_compute_forward_dup_f32(
for (int i00 = 0; i00 < ne00; i00++) {
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
dst_ptr[id] = ggml_fp32_to_fp16(*src0_ptr);
dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
id++;
}
}
@@ -3265,7 +3278,7 @@ void ggml_compute_forward_dup_f32(
for (int i00 = 0; i00 < ne00; i00++) {
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
dst_ptr[id] = ggml_fp32_to_fp16(*src0_ptr);
dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
id++;
}
}
@@ -4547,7 +4560,7 @@ void ggml_compute_forward_mul_mat_f16_f32(
int id = 0;
for (int i01 = 0; i01 < ne01; ++i01) {
for (int i00 = 0; i00 < ne00; ++i00) {
wdata[id++] = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
}
}
}
@@ -4577,11 +4590,23 @@ void ggml_compute_forward_mul_mat_f16_f32(
// zT = y * xT
{
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
ne11, ne01, ne10,
1.0f, y, ne10,
x, ne10,
0.0f, d, ne01);
//cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
// ne11, ne01, ne10,
// 1.0f, y, ne10,
// x, ne10,
// 0.0f, d, ne01);
// call sgemm_ through its Fortran interface - this is also compatible with NVBLAS
float one = 1.0f;
float zero = 0.0f;
sgemm_(
"T", "N",
&ne0, &ne1, &ne10,
&one,
x, &ne10,
y, &ne10,
&zero,
d, &ne0);
}
}
}
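> Reviewer note: the Fortran-interface sgemm_ is column-major and takes every argument by pointer (hence the one/zero locals), while the commented-out CBLAS call handled the row-major layout internally. The two are reconciled by the usual identity that a row-major M x N matrix occupies the same bytes as a column-major N x M matrix, so the product is computed transposed. A sketch of the correspondence, assuming a CBLAS header and a Fortran-interface sgemm_ are available at link time:
```
#include <cblas.h>

extern void sgemm_(const char * transa, const char * transb,
                   const int * m, const int * n, const int * k,
                   const float * alpha, const float * a, const int * lda,
                   const float * b, const int * ldb,
                   const float * beta, float * c, const int * ldc);

// Compute C = A * B^T (row-major) two equivalent ways.
//   A: rows x inner, B: cols x inner, C: rows x cols, all row-major.
void sgemm_both_ways(int rows, int cols, int inner,
                     const float * A, const float * B, float * C) {
    // 1) CBLAS does the row-major bookkeeping for us.
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                rows, cols, inner,
                1.0f, A, inner, B, inner, 0.0f, C, cols);

    // 2) Fortran interface: reinterpret the row-major arrays as column-major
    //    transposes and compute C^T = B * A^T instead (same bytes in C).
    const float one = 1.0f, zero = 0.0f;
    sgemm_("T", "N", &cols, &rows, &inner,
           &one, B, &inner, A, &inner, &zero, C, &cols);
}
```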
@@ -4601,7 +4626,7 @@ void ggml_compute_forward_mul_mat_f16_f32(
for (int i12 = 0; i12 < ne12; ++i12) {
for (int i11 = 0; i11 < ne11; ++i11) {
for (int i10 = 0; i10 < ne10; ++i10) {
wdata[id++] = ggml_fp32_to_fp16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
}
}
}
@@ -4635,12 +4660,12 @@ void ggml_compute_forward_mul_mat_f16_f32(
const int ic1 = MIN(ic0 + dc, ne);
for (int i = ic0; i < ic1; ++i) {
((float *) dst->data)[i] = ggml_fp16_to_fp32(wdata[i]);
((float *) dst->data)[i] = GGML_FP16_TO_FP32(wdata[i]);
}
for (int k = 1; k < nth; k++) {
for (int i = ic0; i < ic1; ++i) {
((float *) dst->data)[i] += ggml_fp16_to_fp32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]);
((float *) dst->data)[i] += GGML_FP16_TO_FP32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]);
}
}
@@ -4911,7 +4936,7 @@ void ggml_compute_forward_get_rows_f16(
for (int j = 0; j < nc; ++j) {
ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];
((float *) ((char *) dst->data + i*dst->nb[1]))[j] = ggml_fp16_to_fp32(v);
((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v);
}
}
}
@@ -5077,9 +5102,9 @@ void ggml_compute_forward_soft_max_f32(
p[i] = 0.0;
} else {
//const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
ggml_fp16_t s = ggml_fp32_to_fp16(p[i] - max);
ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max);
memcpy(&ss, &s, sizeof(ss));
const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
sum += val;
p[i] = val;
}
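> Reviewer note: subtracting the row maximum before exponentiating is the standard stabilization for softmax: softmax(x) is invariant to adding a constant, and shifting by -max keeps every exponent at or below zero, which avoids float overflow and keeps the argument in the range the fp16 exp table can represent. The plain-float version of the computation above, as a sketch:
```
// Numerically stable softmax sketch (the fp16 table lookup is elided).
#include <math.h>

void softmax_sketch(const int n, float * p) {
    float max = p[0];
    for (int i = 1; i < n; ++i) {
        if (p[i] > max) max = p[i];
    }
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        p[i] = expf(p[i] - max); // every argument is <= 0: no overflow
        sum += p[i];
    }
    for (int i = 0; i < n; ++i) {
        p[i] /= sum; // normalize so the row sums to 1
    }
}
```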
@@ -5283,7 +5308,7 @@ void ggml_compute_forward_conv_1d_1s_f16_f32(
const float * const src = (float *)((char *) src1->data + i11*nb11);
ggml_fp16_t * dst_data = wdata;
for (int i10 = 0; i10 < ne10; i10++) {
dst_data[(i10 + nh)*ew0 + i11] = ggml_fp32_to_fp16(src[i10]);
dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
}
}
}
@@ -5549,7 +5574,7 @@ void ggml_compute_forward_conv_1d_2s_f16_f32(
const float * const src = (float *)((char *) src1->data + i11*nb11);
ggml_fp16_t * dst_data = wdata;
for (int i10 = 0; i10 < ne10; i10++) {
dst_data[(i10 + nh)*ew0 + i11] = ggml_fp32_to_fp16(src[i10]);
dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
}
}
}
@@ -5886,9 +5911,9 @@ void ggml_compute_forward_flash_attn_f32(
S[i] = 0.0;
} else {
//const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
ggml_fp16_t s = ggml_fp32_to_fp16(S[i] - max);
ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
memcpy(&ss, &s, sizeof(ss));
const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
sum += val;
S[i] = val;
}
@@ -6067,9 +6092,9 @@ void ggml_compute_forward_flash_attn_f16(
S[i] = 0.0;
} else {
//const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
ggml_fp16_t s = ggml_fp32_to_fp16(S[i] - max);
ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
memcpy(&ss, &s, sizeof(ss));
const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
sum += val;
S[i] = val;
}
@@ -6084,7 +6109,7 @@ void ggml_compute_forward_flash_attn_f16(
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
for (int i = 0; i < M; i++) {
S16[i] = ggml_fp32_to_fp16(S[i]);
S16[i] = GGML_FP32_TO_FP16(S[i]);
}
for (int ic = 0; ic < nev1; ++ic) {
@@ -6282,7 +6307,7 @@ void ggml_compute_forward_flash_ff_f16(
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
for (int i = 0; i < M; i++) {
S16[i] = ggml_fp32_to_fp16(S[i]);
S16[i] = GGML_FP32_TO_FP16(S[i]);
}
ggml_vec_gelu_f16(neb01, S16, S16);