Compare commits

...

5 Commits

3 changed files with 88 additions and 67 deletions


@@ -52,21 +52,6 @@ The tensor operators are optimized heavily for Apple silicon CPUs. Depending on
intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.
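> Reviewer note: for reference, here is a minimal, self-contained sketch (not taken from this repository) of what the Accelerate path boils down to: a single-precision GEMM call through the CBLAS interface that the framework exposes.
```
// Hypothetical demo, not part of whisper.cpp. Build on macOS with:
//   clang sgemm_demo.c -framework Accelerate
#include <Accelerate/Accelerate.h>
#include <stdio.h>

int main(void) {
    // C (2x2) = A (2x3) * B (3x2), all matrices row-major
    float A[6] = { 1, 2, 3, 4, 5, 6 };
    float B[6] = { 7, 8, 9, 10, 11, 12 };
    float C[4] = { 0 };

    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                2, 2, 3,      // M, N, K
                1.0f, A, 3,   // alpha, A, lda
                B, 2,         // B, ldb
                0.0f, C, 2);  // beta, C, ldc

    printf("%.0f %.0f\n%.0f %.0f\n", C[0], C[1], C[2], C[3]); // 58 64 / 139 154
    return 0;
}
```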
## Limitations
- Inference only
- No GPU support
- Very basic greedy sampling scheme - always pick the token with the highest probability.
This should be similar to the [GreedyDecoder](https://github.com/openai/whisper/blob/main/whisper/decoding.py#L249-L274)
from the original Python implementation, so in order to make a fair comparison between the two implementations, make sure
to run the Python code with the following parameters:
```
whisper --best_of None --beam_size None ...
```
In the future, `whisper.cpp` will support more sampling strategies.
## Quick start
First, download one of the Whisper models converted in [ggml format](models). For example:
@@ -220,6 +205,21 @@ make large
| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| large | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
## Limitations
- Inference only
- No GPU support
- Very basic greedy sampling scheme - always pick the token with the highest probability (see the sketch below).
This should be similar to the [GreedyDecoder](https://github.com/openai/whisper/blob/main/whisper/decoding.py#L249-L274)
from the original Python implementation, so in order to make a fair comparison between the two implementations, make sure
to run the Python code with the following parameters:
```
whisper --best_of None --beam_size None ...
```
In the future, `whisper.cpp` will support more sampling strategies.
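> Reviewer note: for intuition, the greedy strategy described above reduces to a single argmax over the vocabulary at each decoding step. A hypothetical sketch follows; the identifiers are illustrative, not the actual whisper.cpp API.
```
// Hypothetical sketch of greedy sampling; `logits` and `n_vocab` are
// illustrative names, not the real whisper.cpp interface.
static int sample_greedy(const float * logits, int n_vocab) {
    int best = 0;
    for (int i = 1; i < n_vocab; ++i) {
        if (logits[i] > logits[best]) {
            best = i; // keep the index of the highest-scoring token
        }
    }
    return best;
}
```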
## Another example
Here is another example of transcribing a [3:24 min speech](https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg)


@@ -4,10 +4,6 @@ set -eo pipefail
# Idea by @semiformal-net
# ref: https://github.com/ggerganov/whisper.cpp/issues/185
#
# TODO:
# - Currently, there is a gap between sequential chunks, so some of the words are dropped. Need to figure out a
# way to produce a continuous stream of audio chunks.
#
url="http://a.files.bbci.co.uk/media/live/manifesto/audio/simulcast/hls/nonuk/sbr_low/ak/bbc_world_service.m3u8"
fmt=aac # the audio format extension of the stream (TODO: auto detect)

ggml.c

@@ -96,6 +96,8 @@ typedef void* thread_ret_t;
#include <Accelerate/Accelerate.h>
#elif GGML_USE_OPENBLAS
#include <cblas.h>
// prototype for the Fortran-interface BLAS sgemm_ routine (column-major), used below
extern void sgemm_(char* transa, char* transb, int* m, int* n, int* k, float* alpha, float* a, int* lda, float* b, int* ldb, float* beta, float* c, int* ldc);
#endif
// floating point type used to accumulate sums
@@ -120,6 +122,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
return x;
}
#define GGML_FP16_TO_FP32(x) (x)
#define GGML_FP32_TO_FP16(x) (x)
#else
#ifdef __wasm_simd128__
@@ -139,6 +144,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
return _cvtss_sh(f, 0);
}
#define GGML_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#else
static inline float fp32_from_bits(uint32_t w) {
@@ -205,8 +213,13 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
const uint32_t nonsign = exp_bits + mantissa_bits;
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
#endif
#endif
#define GGML_FP16_TO_FP32(x) ggml_fp16_to_fp32(x)
#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x)
#endif // __F16C__
#endif // __ARM_NEON
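> Reviewer note: the GGML_FP16_TO_FP32 / GGML_FP32_TO_FP16 macros introduced in these hunks let each platform branch plug in its cheapest conversion (a no-op where ggml_fp16_t is a native half type, a single F16C instruction on x86, the bit-twiddling routine otherwise), avoiding a function call wherever the target has native support. A reduced sketch of the dispatch pattern, with illustrative names:
```
// Simplified illustration of the pattern, not the actual ggml code.
#if defined(__ARM_NEON)
typedef __fp16 my_fp16_t;
#define MY_FP16_TO_FP32(x) (x)              // native type converts implicitly
#elif defined(__F16C__)
#include <immintrin.h>
typedef unsigned short my_fp16_t;
#define MY_FP16_TO_FP32(x) _cvtsh_ss(x)     // one hardware instruction
#else
typedef unsigned short my_fp16_t;
float my_fp16_to_fp32(my_fp16_t x);         // software fallback, defined elsewhere
#define MY_FP16_TO_FP32(x) my_fp16_to_fp32(x)
#endif
```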
//
// global data
@@ -589,7 +602,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
// leftovers
for (int i = n32; i < n; ++i) {
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#elif defined(__AVX2__)
// AVX 256-bit
@@ -633,7 +646,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
// leftovers
for (int i = n32; i < n; ++i) {
//GGML_ASSERT(false);
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#elif defined(__AVX__)
// AVX 256-bit
@@ -677,7 +690,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
// leftovers
for (int i = n32; i < n; ++i) {
//GGML_ASSERT(false);
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#elif defined(__wasm_simd128__)
// WASM 128-bit
@@ -696,8 +709,8 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
for (int i = 0; i < n16; i += 16) {
for (int k = 0; k < 16; ++k) {
tx[k] = ggml_fp16_to_fp32(x[i + k]);
ty[k] = ggml_fp16_to_fp32(y[i + k]);
tx[k] = GGML_FP16_TO_FP32(x[i + k]);
ty[k] = GGML_FP16_TO_FP32(y[i + k]);
}
x0 = wasm_v128_load(tx + 0);
@@ -725,11 +738,11 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
// leftovers
for (int i = n16; i < n; ++i) {
//GGML_ASSERT(false);
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#else
for (int i = 0; i < n; ++i) {
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#endif
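> Reviewer note: all of the branches above share one shape: the bulk of the vector is processed in fixed-size blocks (32 or 16 elements of SIMD work per iteration) and the remainder is finished element by element. A scalar sketch of that structure, under the assumption that only the loop shape matters here:
```
// Scalar sketch of the "blocks + leftovers" pattern used by ggml_vec_dot_f16.
float vec_dot_sketch(const int n, const float * x, const float * y) {
    const int n32 = n & ~31; // largest multiple of 32 that fits in n
    float sum = 0.0f;

    for (int i = 0; i < n32; i += 32) {
        for (int k = 0; k < 32; ++k) { // the real code does this with SIMD FMAs
            sum += x[i + k] * y[i + k];
        }
    }
    for (int i = n32; i < n; ++i) { // leftovers
        sum += x[i] * y[i];
    }
    return sum;
}
```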
@@ -966,7 +979,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
// leftovers
for (int i = n32; i < n; ++i) {
GGML_ASSERT(false);
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#elif defined(__AVX2__)
// AVX 256-bit
@@ -1002,7 +1015,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
// leftovers
for (int i = n32; i < n; ++i) {
GGML_ASSERT(false);
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#elif defined(__AVX__)
// AVX 256-bit
@@ -1038,7 +1051,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
// leftovers
for (int i = n32; i < n; ++i) {
GGML_ASSERT(false);
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#elif defined(__wasm_simd128__)
// WASM SIMD 128-bit
@@ -1054,8 +1067,8 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
for (int i = 0; i < n16; i += 16) {
for (int k = 0; k < 16; ++k) {
tx[k] = ggml_fp16_to_fp32(x[i + k]);
ty[k] = ggml_fp16_to_fp32(y[i + k]);
tx[k] = GGML_FP16_TO_FP32(x[i + k]);
ty[k] = GGML_FP16_TO_FP32(y[i + k]);
}
x0 = wasm_v128_load(tx + 0);
@@ -1079,18 +1092,18 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
wasm_v128_store(ty + 12, y3);
for (int k = 0; k < 16; ++k) {
y[i + k] = ggml_fp32_to_fp16(ty[k]);
y[i + k] = GGML_FP32_TO_FP16(ty[k]);
}
}
// leftovers
for (int i = n16; i < n; ++i) {
GGML_ASSERT(false);
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#else
for (int i = 0; i < n; ++i) {
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#endif
}
@@ -1122,9 +1135,9 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
uint16_t t;
for (int i = 0; i < n; ++i) {
ggml_fp16_t fp16 = ggml_fp32_to_fp16(x[i]);
ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
memcpy(&t, &fp16, sizeof(uint16_t));
y[i] = ggml_fp16_to_fp32(table_gelu_f16[t]);
y[i] = GGML_FP16_TO_FP32(table_gelu_f16[t]);
}
}
#else
@@ -1472,9 +1485,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
for (int i = 0; i < (1 << 16); ++i) {
uint16_t ui = i;
memcpy(&ii, &ui, sizeof(ii));
const float f = ggml_fp16_to_fp32(ii);
table_gelu_f16[i] = ggml_fp32_to_fp16(ggml_gelu_f32(f));
table_exp_f16[i] = ggml_fp32_to_fp16(exp(f));
const float f = GGML_FP16_TO_FP32(ii);
table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f));
}
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
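> Reviewer note: the initialization loop above relies on ggml_fp16_t having only 2^16 distinct bit patterns, so any unary float function (GELU, exp) can be tabulated once at init and later evaluated with one conversion and one load, as in ggml_vec_gelu_f32 earlier in the diff. A self-contained sketch of the idea, with illustrative names standing in for ggml's conversions:
```
#include <math.h>
#include <stdint.h>

// Stand-ins for ggml's fp16 conversions (illustrative declarations).
extern float    fp16_to_fp32(uint16_t h);
extern uint16_t fp32_to_fp16(float f);

static uint16_t table_exp[1 << 16]; // one entry per fp16 bit pattern

void init_exp_table(void) {
    for (uint32_t i = 0; i < (1 << 16); ++i) {
        const float f = fp16_to_fp32((uint16_t) i);
        table_exp[i] = fp32_to_fp16(expf(f));
    }
}

// expf(x) at fp16 precision: convert the argument, index, convert back.
float exp_via_table(float x) {
    return fp16_to_fp32(table_exp[fp32_to_fp16(x)]);
}
```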
@@ -1857,7 +1870,7 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
return ggml_fp16_to_fp32(((ggml_fp16_t *)(tensor->data))[i]);
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
} break;
case GGML_TYPE_F32:
{
@@ -1893,7 +1906,7 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
((ggml_fp16_t *)(tensor->data))[i] = ggml_fp32_to_fp16(value);
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
} break;
case GGML_TYPE_F32:
{
@@ -1927,7 +1940,7 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
return ggml_fp16_to_fp32(((ggml_fp16_t *)(tensor->data))[i]);
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
} break;
case GGML_TYPE_F32:
{
@@ -1963,7 +1976,7 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
((ggml_fp16_t *)(tensor->data))[i] = ggml_fp32_to_fp16(value);
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
} break;
case GGML_TYPE_F32:
{
@@ -3227,7 +3240,7 @@ void ggml_compute_forward_dup_f32(
for (int i00 = 0; i00 < ne00; i00++) {
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
dst_ptr[id] = ggml_fp32_to_fp16(*src0_ptr);
dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
id++;
}
}
@@ -3265,7 +3278,7 @@ void ggml_compute_forward_dup_f32(
for (int i00 = 0; i00 < ne00; i00++) {
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
dst_ptr[id] = ggml_fp32_to_fp16(*src0_ptr);
dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
id++;
}
}
@@ -4547,7 +4560,7 @@ void ggml_compute_forward_mul_mat_f16_f32(
int id = 0;
for (int i01 = 0; i01 < ne01; ++i01) {
for (int i00 = 0; i00 < ne00; ++i00) {
wdata[id++] = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
}
}
}
@@ -4577,11 +4590,23 @@ void ggml_compute_forward_mul_mat_f16_f32(
// zT = y * xT
{
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
ne11, ne01, ne10,
1.0f, y, ne10,
x, ne10,
0.0f, d, ne01);
//cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
// ne11, ne01, ne10,
// 1.0f, y, ne10,
// x, ne10,
// 0.0f, d, ne01);
// call sgemm_ through its Fortran interface - this is also compatible with NVBLAS
float one = 1.0f;
float zero = 0.0f;
sgemm_(
"T", "N",
&ne0, &ne1, &ne10,
&one,
x, &ne10,
y, &ne10,
&zero,
d, &ne0);
}
}
}
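> Reviewer note: the Fortran-interface sgemm_ is column-major and takes every argument by pointer (hence the one/zero locals), while the commented-out CBLAS call handled the row-major layout internally. The two are reconciled by the usual identity that a row-major M x N matrix occupies the same bytes as a column-major N x M matrix, so the product is computed transposed. A sketch of the correspondence, assuming a CBLAS header and a Fortran-interface sgemm_ are available at link time:
```
#include <cblas.h>

extern void sgemm_(const char * transa, const char * transb,
                   const int * m, const int * n, const int * k,
                   const float * alpha, const float * a, const int * lda,
                   const float * b, const int * ldb,
                   const float * beta, float * c, const int * ldc);

// Compute C = A * B^T (row-major) two equivalent ways.
//   A: rows x inner, B: cols x inner, C: rows x cols, all row-major.
void sgemm_both_ways(int rows, int cols, int inner,
                     const float * A, const float * B, float * C) {
    // 1) CBLAS does the row-major bookkeeping for us.
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                rows, cols, inner,
                1.0f, A, inner, B, inner, 0.0f, C, cols);

    // 2) Fortran interface: reinterpret the row-major arrays as column-major
    //    transposes and compute C^T = B * A^T instead (same bytes in C).
    const float one = 1.0f, zero = 0.0f;
    sgemm_("T", "N", &cols, &rows, &inner,
           &one, B, &inner, A, &inner, &zero, C, &cols);
}
```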
@@ -4601,7 +4626,7 @@ void ggml_compute_forward_mul_mat_f16_f32(
for (int i12 = 0; i12 < ne12; ++i12) {
for (int i11 = 0; i11 < ne11; ++i11) {
for (int i10 = 0; i10 < ne10; ++i10) {
wdata[id++] = ggml_fp32_to_fp16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
}
}
}
@@ -4635,12 +4660,12 @@ void ggml_compute_forward_mul_mat_f16_f32(
const int ic1 = MIN(ic0 + dc, ne);
for (int i = ic0; i < ic1; ++i) {
((float *) dst->data)[i] = ggml_fp16_to_fp32(wdata[i]);
((float *) dst->data)[i] = GGML_FP16_TO_FP32(wdata[i]);
}
for (int k = 1; k < nth; k++) {
for (int i = ic0; i < ic1; ++i) {
((float *) dst->data)[i] += ggml_fp16_to_fp32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]);
((float *) dst->data)[i] += GGML_FP16_TO_FP32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]);
}
}
@@ -4911,7 +4936,7 @@ void ggml_compute_forward_get_rows_f16(
for (int j = 0; j < nc; ++j) {
ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];
((float *) ((char *) dst->data + i*dst->nb[1]))[j] = ggml_fp16_to_fp32(v);
((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v);
}
}
}
@@ -5077,9 +5102,9 @@ void ggml_compute_forward_soft_max_f32(
p[i] = 0.0;
} else {
//const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
ggml_fp16_t s = ggml_fp32_to_fp16(p[i] - max);
ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max);
memcpy(&ss, &s, sizeof(ss));
const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
sum += val;
p[i] = val;
}
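> Reviewer note: subtracting the row maximum before exponentiating is the standard stabilization for softmax: softmax(x) is invariant to adding a constant, and shifting by -max keeps every exponent at or below zero, which avoids float overflow and keeps the argument in the range the fp16 exp table can represent. The plain-float version of the computation above, as a sketch:
```
// Numerically stable softmax sketch (the fp16 table lookup is elided).
#include <math.h>

void softmax_sketch(const int n, float * p) {
    float max = p[0];
    for (int i = 1; i < n; ++i) {
        if (p[i] > max) max = p[i];
    }
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        p[i] = expf(p[i] - max); // every argument is <= 0: no overflow
        sum += p[i];
    }
    for (int i = 0; i < n; ++i) {
        p[i] /= sum; // normalize so the row sums to 1
    }
}
```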
@@ -5283,7 +5308,7 @@ void ggml_compute_forward_conv_1d_1s_f16_f32(
const float * const src = (float *)((char *) src1->data + i11*nb11);
ggml_fp16_t * dst_data = wdata;
for (int i10 = 0; i10 < ne10; i10++) {
dst_data[(i10 + nh)*ew0 + i11] = ggml_fp32_to_fp16(src[i10]);
dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
}
}
}
@@ -5549,7 +5574,7 @@ void ggml_compute_forward_conv_1d_2s_f16_f32(
const float * const src = (float *)((char *) src1->data + i11*nb11);
ggml_fp16_t * dst_data = wdata;
for (int i10 = 0; i10 < ne10; i10++) {
dst_data[(i10 + nh)*ew0 + i11] = ggml_fp32_to_fp16(src[i10]);
dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
}
}
}
@@ -5886,9 +5911,9 @@ void ggml_compute_forward_flash_attn_f32(
S[i] = 0.0;
} else {
//const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
ggml_fp16_t s = ggml_fp32_to_fp16(S[i] - max);
ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
memcpy(&ss, &s, sizeof(ss));
const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
sum += val;
S[i] = val;
}
@@ -6067,9 +6092,9 @@ void ggml_compute_forward_flash_attn_f16(
S[i] = 0.0;
} else {
//const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
ggml_fp16_t s = ggml_fp32_to_fp16(S[i] - max);
ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
memcpy(&ss, &s, sizeof(ss));
const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
sum += val;
S[i] = val;
}
@@ -6084,7 +6109,7 @@ void ggml_compute_forward_flash_attn_f16(
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
for (int i = 0; i < M; i++) {
S16[i] = ggml_fp32_to_fp16(S[i]);
S16[i] = GGML_FP32_TO_FP16(S[i]);
}
for (int ic = 0; ic < nev1; ++ic) {
@@ -6282,7 +6307,7 @@ void ggml_compute_forward_flash_ff_f16(
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
for (int i = 0; i < M; i++) {
S16[i] = ggml_fp32_to_fp16(S[i]);
S16[i] = GGML_FP32_TO_FP16(S[i]);
}
ggml_vec_gelu_f16(neb01, S16, S16);