forked from extern/whisper.cpp
ggml : add SSE3 and fp16 conversion lookup table (#368)
* Improves WASM performance: On MacBook M1 Pro, I observe 25% faster using Firefox and 35% faster using Chrome * Add support for SSE3 SIMD * Add SSE3 to system information * Add Imath support for fp16-fp32 conversions * Add Imath to system information * Wrap Imath calls to avoid static function warnings * Drop Imath; Add lookup table for f16 -> f32 conversions * Remove TODO comments * Update SSE3 to new macro arguments * Correct updated macro definitions * Prefer static inline where possible * ggml : static inlines + add public f16 <-> f32 conversions Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
1944e7c33e
commit
a62170c656
4
Makefile
4
Makefile
@ -84,6 +84,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
|
||||
ifneq (,$(findstring f16c,$(F16C_M)))
|
||||
CFLAGS += -mf16c
|
||||
endif
|
||||
SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
|
||||
ifneq (,$(findstring sse3,$(SSE3_M)))
|
||||
CFLAGS += -msse3
|
||||
endif
|
||||
else ifeq ($(UNAME_S),Haiku)
|
||||
AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
|
||||
ifneq (,$(findstring avx,$(AVX1_M)))
|
||||
|
162
ggml.c
162
ggml.c
@ -124,13 +124,8 @@ typedef double ggml_float;
|
||||
//
|
||||
#include <arm_neon.h>
|
||||
|
||||
float ggml_fp16_to_fp32(ggml_fp16_t x) {
|
||||
return x;
|
||||
}
|
||||
|
||||
ggml_fp16_t ggml_fp32_to_fp16(float x) {
|
||||
return x;
|
||||
}
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) (x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) (x)
|
||||
#define GGML_FP32_TO_FP16(x) (x)
|
||||
@ -150,15 +145,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
|
||||
#endif
|
||||
|
||||
#ifdef __F16C__
|
||||
float ggml_fp16_to_fp32(ggml_fp16_t h) {
|
||||
return _cvtsh_ss(h);
|
||||
}
|
||||
ggml_fp16_t ggml_fp32_to_fp16(float f) {
|
||||
return _cvtss_sh(f, 0);
|
||||
}
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) _cvtsh_ss(x)
|
||||
#define GGML_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
||||
|
||||
#else
|
||||
|
||||
@ -183,7 +172,7 @@ static inline uint32_t fp32_to_bits(float f) {
|
||||
return fp32.as_bits;
|
||||
}
|
||||
|
||||
float ggml_fp16_to_fp32(ggml_fp16_t h) {
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
const uint32_t w = (uint32_t) h << 16;
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
const uint32_t two_w = w + w;
|
||||
@ -206,7 +195,7 @@ float ggml_fp16_to_fp32(ggml_fp16_t h) {
|
||||
return fp32_from_bits(result);
|
||||
}
|
||||
|
||||
ggml_fp16_t ggml_fp32_to_fp16(float f) {
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
||||
const float scale_to_inf = 0x1.0p+112f;
|
||||
const float scale_to_zero = 0x1.0p-110f;
|
||||
@ -232,8 +221,8 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
|
||||
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
||||
}
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) ggml_fp16_to_fp32(x)
|
||||
#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x)
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
|
||||
#endif // __F16C__
|
||||
|
||||
@ -249,6 +238,34 @@ static ggml_fp16_t table_gelu_f16[1 << 16];
|
||||
// precomputed exp table for f16 (128 KB)
|
||||
static ggml_fp16_t table_exp_f16[1 << 16];
|
||||
|
||||
// precomputed f32 table for f16 (256 KB)
|
||||
static float table_f32_f16[1 << 16];
|
||||
|
||||
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
||||
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
||||
#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
|
||||
|
||||
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
||||
uint16_t s;
|
||||
memcpy(&s, &f, sizeof(uint16_t));
|
||||
return table_f32_f16[s];
|
||||
}
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
|
||||
#endif
|
||||
|
||||
// note: do not use these inside ggml.c
|
||||
// these are meant to be used via the ggml.h API
|
||||
float ggml_fp16_to_fp32(ggml_fp16_t x) {
|
||||
return GGML_FP16_TO_FP32(x);
|
||||
}
|
||||
|
||||
ggml_fp16_t ggml_fp32_to_fp16(float x) {
|
||||
return GGML_FP32_TO_FP16(x);
|
||||
}
|
||||
|
||||
//
|
||||
// timing
|
||||
//
|
||||
@ -692,6 +709,101 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
|
||||
#define GGML_F16_VEC_MUL GGML_F16x4_MUL
|
||||
#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
|
||||
|
||||
#elif defined(__SSE3__)
|
||||
|
||||
#define GGML_SIMD
|
||||
|
||||
// F32 SSE
|
||||
|
||||
#define GGML_F32_STEP 32
|
||||
#define GGML_F32_EPR 4
|
||||
|
||||
#define GGML_F32x4 __m128
|
||||
#define GGML_F32x4_ZERO _mm_setzero_ps()
|
||||
#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
|
||||
#define GGML_F32x4_LOAD _mm_loadu_ps
|
||||
#define GGML_F32x4_STORE _mm_storeu_ps
|
||||
#if defined(__FMA__)
|
||||
// TODO: Does this work?
|
||||
#define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
|
||||
#else
|
||||
#define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
|
||||
#endif
|
||||
#define GGML_F32x4_ADD _mm_add_ps
|
||||
#define GGML_F32x4_MUL _mm_mul_ps
|
||||
#define GGML_F32x4_REDUCE(res, x) \
|
||||
{ \
|
||||
for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
|
||||
x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \
|
||||
} \
|
||||
for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
|
||||
x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \
|
||||
} \
|
||||
for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
|
||||
x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \
|
||||
} \
|
||||
const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
|
||||
res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
|
||||
}
|
||||
// TODO: is this optimal ?
|
||||
|
||||
#define GGML_F32_VEC GGML_F32x4
|
||||
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
|
||||
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
|
||||
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
|
||||
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
|
||||
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
|
||||
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
|
||||
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
|
||||
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
||||
|
||||
// F16 SSE
|
||||
|
||||
#define GGML_F16_STEP 32
|
||||
#define GGML_F16_EPR 4
|
||||
|
||||
static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
|
||||
float tmp[4];
|
||||
|
||||
tmp[0] = GGML_FP16_TO_FP32(x[0]);
|
||||
tmp[1] = GGML_FP16_TO_FP32(x[1]);
|
||||
tmp[2] = GGML_FP16_TO_FP32(x[2]);
|
||||
tmp[3] = GGML_FP16_TO_FP32(x[3]);
|
||||
|
||||
return _mm_loadu_ps(tmp);
|
||||
}
|
||||
|
||||
static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
|
||||
float arr[4];
|
||||
|
||||
_mm_storeu_ps(arr, y);
|
||||
|
||||
x[0] = GGML_FP32_TO_FP16(arr[0]);
|
||||
x[1] = GGML_FP32_TO_FP16(arr[1]);
|
||||
x[2] = GGML_FP32_TO_FP16(arr[2]);
|
||||
x[3] = GGML_FP32_TO_FP16(arr[3]);
|
||||
}
|
||||
|
||||
#define GGML_F32Cx4 __m128
|
||||
#define GGML_F32Cx4_ZERO _mm_setzero_ps()
|
||||
#define GGML_F32Cx4_SET1(x) _mm_set1_ps(x)
|
||||
#define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x)
|
||||
#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
|
||||
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
|
||||
#define GGML_F32Cx4_ADD _mm_add_ps
|
||||
#define GGML_F32Cx4_MUL _mm_mul_ps
|
||||
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
|
||||
|
||||
#define GGML_F16_VEC GGML_F32Cx4
|
||||
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
|
||||
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
|
||||
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
|
||||
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
|
||||
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
|
||||
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
|
||||
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
|
||||
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
|
||||
|
||||
#endif
|
||||
|
||||
// GGML_F32_ARR / GGML_F16_ARR
|
||||
@ -1269,7 +1381,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
||||
static bool is_first_call = true;
|
||||
|
||||
if (is_first_call) {
|
||||
// initialize GELU and EXP tables
|
||||
// initialize GELU, EXP and F32 tables
|
||||
{
|
||||
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
|
||||
|
||||
@ -1277,7 +1389,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
||||
for (int i = 0; i < (1 << 16); ++i) {
|
||||
uint16_t ui = i;
|
||||
memcpy(&ii, &ui, sizeof(ii));
|
||||
const float f = GGML_FP16_TO_FP32(ii);
|
||||
const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
|
||||
table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
|
||||
table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f));
|
||||
}
|
||||
@ -8232,6 +8344,14 @@ int ggml_cpu_has_blas(void) {
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_sse3(void) {
|
||||
#if defined(__SSE3__)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_vsx(void) {
|
||||
#if defined(__POWER9_VECTOR__)
|
||||
return 1;
|
||||
|
1
ggml.h
1
ggml.h
@ -731,6 +731,7 @@ int ggml_cpu_has_f16c(void);
|
||||
int ggml_cpu_has_fp16_va(void);
|
||||
int ggml_cpu_has_wasm_simd(void);
|
||||
int ggml_cpu_has_blas(void);
|
||||
int ggml_cpu_has_sse3(void);
|
||||
int ggml_cpu_has_vsx(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
@ -2582,6 +2582,7 @@ const char * whisper_print_system_info(void) {
|
||||
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
||||
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
||||
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
||||
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
||||
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
||||
|
||||
return s.c_str();
|
||||
|
Loading…
Reference in New Issue
Block a user