mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-07-08 03:07:10 +02:00
Compare commits
5 Commits
macros-cvt
...
nvblas
Author | SHA1 | Date | |
---|---|---|---|
683f111088 | |||
3996ecc156 | |||
faa85f9840 | |||
b6597539f9 | |||
9a4b7a916e |
30
README.md
30
README.md
@ -52,21 +52,6 @@ The tensor operators are optimized heavily for Apple silicon CPUs. Depending on
|
|||||||
intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
|
intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
|
||||||
the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.
|
the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.
|
||||||
|
|
||||||
## Limitations
|
|
||||||
|
|
||||||
- Inference only
|
|
||||||
- No GPU support
|
|
||||||
- Very basic greedy sampling scheme - always pick the token with the highest probability.
|
|
||||||
This should be similar to the [GreedyDecoder](https://github.com/openai/whisper/blob/main/whisper/decoding.py#L249-L274)
|
|
||||||
from the original python implementation, so in order to make a fair comparison between the 2 implementations, make sure
|
|
||||||
to run the python code with the following parameters:
|
|
||||||
|
|
||||||
```
|
|
||||||
whisper --best_of None --beam_size None ...
|
|
||||||
```
|
|
||||||
|
|
||||||
In the future, `whisper.cpp` will support more sampling strategies.
|
|
||||||
|
|
||||||
## Quick start
|
## Quick start
|
||||||
|
|
||||||
First, download one of the Whisper models converted in [ggml format](models). For example:
|
First, download one of the Whisper models converted in [ggml format](models). For example:
|
||||||
@ -220,6 +205,21 @@ make large
|
|||||||
| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
|
| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
|
||||||
| large | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
|
| large | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
|
||||||
|
|
||||||
|
## Limitations
|
||||||
|
|
||||||
|
- Inference only
|
||||||
|
- No GPU support
|
||||||
|
- Very basic greedy sampling scheme - always pick the token with the highest probability.
|
||||||
|
This should be similar to the [GreedyDecoder](https://github.com/openai/whisper/blob/main/whisper/decoding.py#L249-L274)
|
||||||
|
from the original python implementation, so in order to make a fair comparison between the 2 implementations, make sure
|
||||||
|
to run the python code with the following parameters:
|
||||||
|
|
||||||
|
```
|
||||||
|
whisper --best_of None --beam_size None ...
|
||||||
|
```
|
||||||
|
|
||||||
|
In the future, `whisper.cpp` will support more sampling strategies.
|
||||||
|
|
||||||
## Another example
|
## Another example
|
||||||
|
|
||||||
Here is another example of transcribing a [3:24 min speech](https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg)
|
Here is another example of transcribing a [3:24 min speech](https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg)
|
||||||
|
@ -4,10 +4,6 @@ set -eo pipefail
|
|||||||
# Idea by @semiformal-net
|
# Idea by @semiformal-net
|
||||||
# ref: https://github.com/ggerganov/whisper.cpp/issues/185
|
# ref: https://github.com/ggerganov/whisper.cpp/issues/185
|
||||||
#
|
#
|
||||||
# TODO:
|
|
||||||
# - Currently, there is a gap between sequential chunks, so some of the words are dropped. Need to figure out a
|
|
||||||
# way to produce a continuous stream of audio chunks.
|
|
||||||
#
|
|
||||||
|
|
||||||
url="http://a.files.bbci.co.uk/media/live/manifesto/audio/simulcast/hls/nonuk/sbr_low/ak/bbc_world_service.m3u8"
|
url="http://a.files.bbci.co.uk/media/live/manifesto/audio/simulcast/hls/nonuk/sbr_low/ak/bbc_world_service.m3u8"
|
||||||
fmt=aac # the audio format extension of the stream (TODO: auto detect)
|
fmt=aac # the audio format extension of the stream (TODO: auto detect)
|
||||||
|
121
ggml.c
121
ggml.c
@ -96,6 +96,8 @@ typedef void* thread_ret_t;
|
|||||||
#include <Accelerate/Accelerate.h>
|
#include <Accelerate/Accelerate.h>
|
||||||
#elif GGML_USE_OPENBLAS
|
#elif GGML_USE_OPENBLAS
|
||||||
#include <cblas.h>
|
#include <cblas.h>
|
||||||
|
// sgemm
|
||||||
|
extern void sgemm_(char* transa, char* transb, int* m, int* n, int* k, float* alpha, float* a, int* lda, float* b, int* ldb, float* beta, float* c, int* ldc);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// floating point type used to accumulate sums
|
// floating point type used to accumulate sums
|
||||||
@ -120,6 +122,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
|
|||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define GGML_FP16_TO_FP32(x) (x)
|
||||||
|
#define GGML_FP32_TO_FP16(x) (x)
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#ifdef __wasm_simd128__
|
#ifdef __wasm_simd128__
|
||||||
@ -139,6 +144,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
|
|||||||
return _cvtss_sh(f, 0);
|
return _cvtss_sh(f, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define GGML_FP16_TO_FP32(x) _cvtsh_ss(x)
|
||||||
|
#define GGML_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
static inline float fp32_from_bits(uint32_t w) {
|
static inline float fp32_from_bits(uint32_t w) {
|
||||||
@ -205,8 +213,13 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
|
|||||||
const uint32_t nonsign = exp_bits + mantissa_bits;
|
const uint32_t nonsign = exp_bits + mantissa_bits;
|
||||||
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
#endif
|
#define GGML_FP16_TO_FP32(x) ggml_fp16_to_fp32(x)
|
||||||
|
#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x)
|
||||||
|
|
||||||
|
#endif // __F16C__
|
||||||
|
|
||||||
|
#endif // __ARM_NEON
|
||||||
|
|
||||||
//
|
//
|
||||||
// global data
|
// global data
|
||||||
@ -589,7 +602,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
|
|||||||
|
|
||||||
// leftovers
|
// leftovers
|
||||||
for (int i = n32; i < n; ++i) {
|
for (int i = n32; i < n; ++i) {
|
||||||
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
|
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
|
||||||
}
|
}
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
// AVX 256-bit
|
// AVX 256-bit
|
||||||
@ -633,7 +646,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
|
|||||||
// leftovers
|
// leftovers
|
||||||
for (int i = n32; i < n; ++i) {
|
for (int i = n32; i < n; ++i) {
|
||||||
//GGML_ASSERT(false);
|
//GGML_ASSERT(false);
|
||||||
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
|
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
|
||||||
}
|
}
|
||||||
#elif defined(__AVX__)
|
#elif defined(__AVX__)
|
||||||
// AVX 256-bit
|
// AVX 256-bit
|
||||||
@ -677,7 +690,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
|
|||||||
// leftovers
|
// leftovers
|
||||||
for (int i = n32; i < n; ++i) {
|
for (int i = n32; i < n; ++i) {
|
||||||
//GGML_ASSERT(false);
|
//GGML_ASSERT(false);
|
||||||
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
|
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
|
||||||
}
|
}
|
||||||
#elif defined(__wasm_simd128__)
|
#elif defined(__wasm_simd128__)
|
||||||
// WASM 128-bit
|
// WASM 128-bit
|
||||||
@ -696,8 +709,8 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
|
|||||||
|
|
||||||
for (int i = 0; i < n16; i += 16) {
|
for (int i = 0; i < n16; i += 16) {
|
||||||
for (int k = 0; k < 16; ++k) {
|
for (int k = 0; k < 16; ++k) {
|
||||||
tx[k] = ggml_fp16_to_fp32(x[i + k]);
|
tx[k] = GGML_FP16_TO_FP32(x[i + k]);
|
||||||
ty[k] = ggml_fp16_to_fp32(y[i + k]);
|
ty[k] = GGML_FP16_TO_FP32(y[i + k]);
|
||||||
}
|
}
|
||||||
|
|
||||||
x0 = wasm_v128_load(tx + 0);
|
x0 = wasm_v128_load(tx + 0);
|
||||||
@ -725,11 +738,11 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
|
|||||||
// leftovers
|
// leftovers
|
||||||
for (int i = n16; i < n; ++i) {
|
for (int i = n16; i < n; ++i) {
|
||||||
//GGML_ASSERT(false);
|
//GGML_ASSERT(false);
|
||||||
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
|
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
for (int i = 0; i < n; ++i) {
|
for (int i = 0; i < n; ++i) {
|
||||||
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
|
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -966,7 +979,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
|
|||||||
// leftovers
|
// leftovers
|
||||||
for (int i = n32; i < n; ++i) {
|
for (int i = n32; i < n; ++i) {
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
|
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
|
||||||
}
|
}
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
// AVX 256-bit
|
// AVX 256-bit
|
||||||
@ -1002,7 +1015,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
|
|||||||
// leftovers
|
// leftovers
|
||||||
for (int i = n32; i < n; ++i) {
|
for (int i = n32; i < n; ++i) {
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
|
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
|
||||||
}
|
}
|
||||||
#elif defined(__AVX__)
|
#elif defined(__AVX__)
|
||||||
// AVX 256-bit
|
// AVX 256-bit
|
||||||
@ -1038,7 +1051,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
|
|||||||
// leftovers
|
// leftovers
|
||||||
for (int i = n32; i < n; ++i) {
|
for (int i = n32; i < n; ++i) {
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
|
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
|
||||||
}
|
}
|
||||||
#elif defined(__wasm_simd128__)
|
#elif defined(__wasm_simd128__)
|
||||||
// WASM SIMD 128-bit
|
// WASM SIMD 128-bit
|
||||||
@ -1054,8 +1067,8 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
|
|||||||
|
|
||||||
for (int i = 0; i < n16; i += 16) {
|
for (int i = 0; i < n16; i += 16) {
|
||||||
for (int k = 0; k < 16; ++k) {
|
for (int k = 0; k < 16; ++k) {
|
||||||
tx[k] = ggml_fp16_to_fp32(x[i + k]);
|
tx[k] = GGML_FP16_TO_FP32(x[i + k]);
|
||||||
ty[k] = ggml_fp16_to_fp32(y[i + k]);
|
ty[k] = GGML_FP16_TO_FP32(y[i + k]);
|
||||||
}
|
}
|
||||||
|
|
||||||
x0 = wasm_v128_load(tx + 0);
|
x0 = wasm_v128_load(tx + 0);
|
||||||
@ -1079,18 +1092,18 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
|
|||||||
wasm_v128_store(ty + 12, y3);
|
wasm_v128_store(ty + 12, y3);
|
||||||
|
|
||||||
for (int k = 0; k < 16; ++k) {
|
for (int k = 0; k < 16; ++k) {
|
||||||
y[i + k] = ggml_fp32_to_fp16(ty[k]);
|
y[i + k] = GGML_FP32_TO_FP16(ty[k]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// leftovers
|
// leftovers
|
||||||
for (int i = n16; i < n; ++i) {
|
for (int i = n16; i < n; ++i) {
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
|
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
for (int i = 0; i < n; ++i) {
|
for (int i = 0; i < n; ++i) {
|
||||||
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
|
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -1122,9 +1135,9 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
|
|||||||
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
|
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
|
||||||
uint16_t t;
|
uint16_t t;
|
||||||
for (int i = 0; i < n; ++i) {
|
for (int i = 0; i < n; ++i) {
|
||||||
ggml_fp16_t fp16 = ggml_fp32_to_fp16(x[i]);
|
ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
|
||||||
memcpy(&t, &fp16, sizeof(uint16_t));
|
memcpy(&t, &fp16, sizeof(uint16_t));
|
||||||
y[i] = ggml_fp16_to_fp32(table_gelu_f16[t]);
|
y[i] = GGML_FP16_TO_FP32(table_gelu_f16[t]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
@ -1472,9 +1485,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|||||||
for (int i = 0; i < (1 << 16); ++i) {
|
for (int i = 0; i < (1 << 16); ++i) {
|
||||||
uint16_t ui = i;
|
uint16_t ui = i;
|
||||||
memcpy(&ii, &ui, sizeof(ii));
|
memcpy(&ii, &ui, sizeof(ii));
|
||||||
const float f = ggml_fp16_to_fp32(ii);
|
const float f = GGML_FP16_TO_FP32(ii);
|
||||||
table_gelu_f16[i] = ggml_fp32_to_fp16(ggml_gelu_f32(f));
|
table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
|
||||||
table_exp_f16[i] = ggml_fp32_to_fp16(exp(f));
|
table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
|
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
|
||||||
@ -1857,7 +1870,7 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
|
|||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
||||||
return ggml_fp16_to_fp32(((ggml_fp16_t *)(tensor->data))[i]);
|
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
{
|
{
|
||||||
@ -1893,7 +1906,7 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
|
|||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
||||||
((ggml_fp16_t *)(tensor->data))[i] = ggml_fp32_to_fp16(value);
|
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
{
|
{
|
||||||
@ -1927,7 +1940,7 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
|
|||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
||||||
return ggml_fp16_to_fp32(((ggml_fp16_t *)(tensor->data))[i]);
|
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
{
|
{
|
||||||
@ -1963,7 +1976,7 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
|
|||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
||||||
((ggml_fp16_t *)(tensor->data))[i] = ggml_fp32_to_fp16(value);
|
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
{
|
{
|
||||||
@ -3227,7 +3240,7 @@ void ggml_compute_forward_dup_f32(
|
|||||||
for (int i00 = 0; i00 < ne00; i00++) {
|
for (int i00 = 0; i00 < ne00; i00++) {
|
||||||
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||||
|
|
||||||
dst_ptr[id] = ggml_fp32_to_fp16(*src0_ptr);
|
dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
|
||||||
id++;
|
id++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -3265,7 +3278,7 @@ void ggml_compute_forward_dup_f32(
|
|||||||
for (int i00 = 0; i00 < ne00; i00++) {
|
for (int i00 = 0; i00 < ne00; i00++) {
|
||||||
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||||
|
|
||||||
dst_ptr[id] = ggml_fp32_to_fp16(*src0_ptr);
|
dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
|
||||||
id++;
|
id++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -4547,7 +4560,7 @@ void ggml_compute_forward_mul_mat_f16_f32(
|
|||||||
int id = 0;
|
int id = 0;
|
||||||
for (int i01 = 0; i01 < ne01; ++i01) {
|
for (int i01 = 0; i01 < ne01; ++i01) {
|
||||||
for (int i00 = 0; i00 < ne00; ++i00) {
|
for (int i00 = 0; i00 < ne00; ++i00) {
|
||||||
wdata[id++] = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
|
wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -4577,11 +4590,23 @@ void ggml_compute_forward_mul_mat_f16_f32(
|
|||||||
|
|
||||||
// zT = y * xT
|
// zT = y * xT
|
||||||
{
|
{
|
||||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
//cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||||
ne11, ne01, ne10,
|
// ne11, ne01, ne10,
|
||||||
1.0f, y, ne10,
|
// 1.0f, y, ne10,
|
||||||
x, ne10,
|
// x, ne10,
|
||||||
0.0f, d, ne01);
|
// 0.0f, d, ne01);
|
||||||
|
|
||||||
|
// this is compatible with nvblas
|
||||||
|
float one = 1.0f;
|
||||||
|
float zero = 0.0f;
|
||||||
|
sgemm_(
|
||||||
|
"T", "N",
|
||||||
|
&ne0, &ne1, &ne10,
|
||||||
|
&one,
|
||||||
|
x, &ne10,
|
||||||
|
y, &ne10,
|
||||||
|
&zero,
|
||||||
|
d, &ne0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -4601,7 +4626,7 @@ void ggml_compute_forward_mul_mat_f16_f32(
|
|||||||
for (int i12 = 0; i12 < ne12; ++i12) {
|
for (int i12 = 0; i12 < ne12; ++i12) {
|
||||||
for (int i11 = 0; i11 < ne11; ++i11) {
|
for (int i11 = 0; i11 < ne11; ++i11) {
|
||||||
for (int i10 = 0; i10 < ne10; ++i10) {
|
for (int i10 = 0; i10 < ne10; ++i10) {
|
||||||
wdata[id++] = ggml_fp32_to_fp16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
|
wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -4635,12 +4660,12 @@ void ggml_compute_forward_mul_mat_f16_f32(
|
|||||||
const int ic1 = MIN(ic0 + dc, ne);
|
const int ic1 = MIN(ic0 + dc, ne);
|
||||||
|
|
||||||
for (int i = ic0; i < ic1; ++i) {
|
for (int i = ic0; i < ic1; ++i) {
|
||||||
((float *) dst->data)[i] = ggml_fp16_to_fp32(wdata[i]);
|
((float *) dst->data)[i] = GGML_FP16_TO_FP32(wdata[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int k = 1; k < nth; k++) {
|
for (int k = 1; k < nth; k++) {
|
||||||
for (int i = ic0; i < ic1; ++i) {
|
for (int i = ic0; i < ic1; ++i) {
|
||||||
((float *) dst->data)[i] += ggml_fp16_to_fp32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]);
|
((float *) dst->data)[i] += GGML_FP16_TO_FP32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -4911,7 +4936,7 @@ void ggml_compute_forward_get_rows_f16(
|
|||||||
|
|
||||||
for (int j = 0; j < nc; ++j) {
|
for (int j = 0; j < nc; ++j) {
|
||||||
ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];
|
ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];
|
||||||
((float *) ((char *) dst->data + i*dst->nb[1]))[j] = ggml_fp16_to_fp32(v);
|
((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -5077,9 +5102,9 @@ void ggml_compute_forward_soft_max_f32(
|
|||||||
p[i] = 0.0;
|
p[i] = 0.0;
|
||||||
} else {
|
} else {
|
||||||
//const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
|
//const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
|
||||||
ggml_fp16_t s = ggml_fp32_to_fp16(p[i] - max);
|
ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max);
|
||||||
memcpy(&ss, &s, sizeof(ss));
|
memcpy(&ss, &s, sizeof(ss));
|
||||||
const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
|
const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
|
||||||
sum += val;
|
sum += val;
|
||||||
p[i] = val;
|
p[i] = val;
|
||||||
}
|
}
|
||||||
@ -5283,7 +5308,7 @@ void ggml_compute_forward_conv_1d_1s_f16_f32(
|
|||||||
const float * const src = (float *)((char *) src1->data + i11*nb11);
|
const float * const src = (float *)((char *) src1->data + i11*nb11);
|
||||||
ggml_fp16_t * dst_data = wdata;
|
ggml_fp16_t * dst_data = wdata;
|
||||||
for (int i10 = 0; i10 < ne10; i10++) {
|
for (int i10 = 0; i10 < ne10; i10++) {
|
||||||
dst_data[(i10 + nh)*ew0 + i11] = ggml_fp32_to_fp16(src[i10]);
|
dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -5549,7 +5574,7 @@ void ggml_compute_forward_conv_1d_2s_f16_f32(
|
|||||||
const float * const src = (float *)((char *) src1->data + i11*nb11);
|
const float * const src = (float *)((char *) src1->data + i11*nb11);
|
||||||
ggml_fp16_t * dst_data = wdata;
|
ggml_fp16_t * dst_data = wdata;
|
||||||
for (int i10 = 0; i10 < ne10; i10++) {
|
for (int i10 = 0; i10 < ne10; i10++) {
|
||||||
dst_data[(i10 + nh)*ew0 + i11] = ggml_fp32_to_fp16(src[i10]);
|
dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -5886,9 +5911,9 @@ void ggml_compute_forward_flash_attn_f32(
|
|||||||
S[i] = 0.0;
|
S[i] = 0.0;
|
||||||
} else {
|
} else {
|
||||||
//const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
|
//const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
|
||||||
ggml_fp16_t s = ggml_fp32_to_fp16(S[i] - max);
|
ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
|
||||||
memcpy(&ss, &s, sizeof(ss));
|
memcpy(&ss, &s, sizeof(ss));
|
||||||
const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
|
const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
|
||||||
sum += val;
|
sum += val;
|
||||||
S[i] = val;
|
S[i] = val;
|
||||||
}
|
}
|
||||||
@ -6067,9 +6092,9 @@ void ggml_compute_forward_flash_attn_f16(
|
|||||||
S[i] = 0.0;
|
S[i] = 0.0;
|
||||||
} else {
|
} else {
|
||||||
//const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
|
//const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
|
||||||
ggml_fp16_t s = ggml_fp32_to_fp16(S[i] - max);
|
ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
|
||||||
memcpy(&ss, &s, sizeof(ss));
|
memcpy(&ss, &s, sizeof(ss));
|
||||||
const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
|
const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
|
||||||
sum += val;
|
sum += val;
|
||||||
S[i] = val;
|
S[i] = val;
|
||||||
}
|
}
|
||||||
@ -6084,7 +6109,7 @@ void ggml_compute_forward_flash_attn_f16(
|
|||||||
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
|
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
|
||||||
|
|
||||||
for (int i = 0; i < M; i++) {
|
for (int i = 0; i < M; i++) {
|
||||||
S16[i] = ggml_fp32_to_fp16(S[i]);
|
S16[i] = GGML_FP32_TO_FP16(S[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int ic = 0; ic < nev1; ++ic) {
|
for (int ic = 0; ic < nev1; ++ic) {
|
||||||
@ -6282,7 +6307,7 @@ void ggml_compute_forward_flash_ff_f16(
|
|||||||
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
|
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
|
||||||
|
|
||||||
for (int i = 0; i < M; i++) {
|
for (int i = 0; i < M; i++) {
|
||||||
S16[i] = ggml_fp32_to_fp16(S[i]);
|
S16[i] = GGML_FP32_TO_FP16(S[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_vec_gelu_f16(neb01, S16, S16);
|
ggml_vec_gelu_f16(neb01, S16, S16);
|
||||||
|
Reference in New Issue
Block a user