forked from extern/whisper.cpp
ggml : speed-up soft max via Accelerate + unroll
This commit is contained in:
parent
d51fc3ee0a
commit
d61d55cd4b
162
ggml.c
162
ggml.c
@ -81,6 +81,7 @@ typedef void* thread_ret_t;
|
|||||||
|
|
||||||
#define GGML_DEBUG 0
|
#define GGML_DEBUG 0
|
||||||
#define GGML_GELU_FP16
|
#define GGML_GELU_FP16
|
||||||
|
#define GGML_SOFT_MAX_UNROLL 4
|
||||||
|
|
||||||
#if UINTPTR_MAX == 0xFFFFFFFF
|
#if UINTPTR_MAX == 0xFFFFFFFF
|
||||||
#define GGML_MEM_ALIGN 4
|
#define GGML_MEM_ALIGN 4
|
||||||
@ -310,6 +311,7 @@ int64_t ggml_cycles_per_ms(void) {
|
|||||||
return CLOCKS_PER_SEC/1000;
|
return CLOCKS_PER_SEC/1000;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//#define GGML_PERF
|
||||||
#ifdef GGML_PERF
|
#ifdef GGML_PERF
|
||||||
#define ggml_perf_time_ms() ggml_time_ms()
|
#define ggml_perf_time_ms() ggml_time_ms()
|
||||||
#define ggml_perf_time_us() ggml_time_us()
|
#define ggml_perf_time_us() ggml_time_us()
|
||||||
@ -1316,25 +1318,25 @@ size_t ggml_element_size(const struct ggml_tensor * tensor) {
|
|||||||
return GGML_TYPE_SIZE[tensor->type];
|
return GGML_TYPE_SIZE[tensor->type];
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_is_scalar(const struct ggml_tensor * tensor) {
|
static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
|
||||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||||
|
|
||||||
return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_is_vector(const struct ggml_tensor * tensor) {
|
static inline bool ggml_is_vector(const struct ggml_tensor * tensor) {
|
||||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||||
|
|
||||||
return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_is_matrix(const struct ggml_tensor * tensor) {
|
static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
|
||||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||||
|
|
||||||
return tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
return tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
||||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||||
|
|
||||||
return
|
return
|
||||||
@ -1343,7 +1345,7 @@ bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor *
|
|||||||
(t0->ne[3] == t1->ne[3]);
|
(t0->ne[3] == t1->ne[3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
||||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||||
|
|
||||||
return
|
return
|
||||||
@ -1353,7 +1355,7 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
|||||||
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
|
static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
|
||||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||||
|
|
||||||
return
|
return
|
||||||
@ -1362,7 +1364,7 @@ bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
|
|||||||
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
||||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||||
|
|
||||||
return
|
return
|
||||||
@ -1373,7 +1375,7 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
|
|||||||
}
|
}
|
||||||
|
|
||||||
// check if t1 can be represented as a repeatition of t0
|
// check if t1 can be represented as a repeatition of t0
|
||||||
bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
||||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||||
|
|
||||||
return
|
return
|
||||||
@ -1383,14 +1385,20 @@ bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t
|
|||||||
(t1->ne[3]%t0->ne[3] == 0);
|
(t1->ne[3]%t0->ne[3] == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
int ggml_up32(int n) {
|
static inline int ggml_up32(int n) {
|
||||||
return (n + 31) & ~31;
|
return (n + 31) & ~31;
|
||||||
}
|
}
|
||||||
|
|
||||||
int ggml_up64(int n) {
|
static inline int ggml_up64(int n) {
|
||||||
return (n + 63) & ~63;
|
return (n + 63) & ~63;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline int ggml_up(int n, int m) {
|
||||||
|
// assert m is a power of 2
|
||||||
|
GGML_ASSERT((m & (m - 1)) == 0);
|
||||||
|
return (n + m - 1) & ~(m - 1);
|
||||||
|
}
|
||||||
|
|
||||||
// assert that pointer is aligned to GGML_MEM_ALIGN
|
// assert that pointer is aligned to GGML_MEM_ALIGN
|
||||||
#define ggml_assert_aligned(ptr) \
|
#define ggml_assert_aligned(ptr) \
|
||||||
assert(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
|
assert(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
|
||||||
@ -5094,21 +5102,19 @@ static void ggml_compute_forward_soft_max_f32(
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
float max = -INFINITY;
|
float max = -INFINITY;
|
||||||
for (int i = 0; i < nc; i++) {
|
ggml_vec_max_f32(nc, &max, p);
|
||||||
max = MAX(max, p[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_float sum = 0.0;
|
ggml_float sum = 0.0;
|
||||||
|
|
||||||
uint16_t ss;
|
uint16_t scvt;
|
||||||
for (int i = 0; i < nc; i++) {
|
for (int i = 0; i < nc; i++) {
|
||||||
if (p[i] == -INFINITY) {
|
if (p[i] == -INFINITY) {
|
||||||
p[i] = 0.0;
|
p[i] = 0.0f;
|
||||||
} else {
|
} else {
|
||||||
//const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
|
//const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
|
||||||
ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max);
|
ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max);
|
||||||
memcpy(&ss, &s, sizeof(ss));
|
memcpy(&scvt, &s, sizeof(scvt));
|
||||||
const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
|
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
|
||||||
sum += val;
|
sum += val;
|
||||||
p[i] = val;
|
p[i] = val;
|
||||||
}
|
}
|
||||||
@ -5820,6 +5826,8 @@ static void ggml_compute_forward_flash_attn_f32(
|
|||||||
const int P = nek1 - N;
|
const int P = nek1 - N;
|
||||||
const int M = P + N;
|
const int M = P + N;
|
||||||
|
|
||||||
|
const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
|
||||||
|
|
||||||
GGML_ASSERT(ne0 == D);
|
GGML_ASSERT(ne0 == D);
|
||||||
GGML_ASSERT(ne1 == N);
|
GGML_ASSERT(ne1 == N);
|
||||||
GGML_ASSERT(P >= 0);
|
GGML_ASSERT(P >= 0);
|
||||||
@ -5872,7 +5880,11 @@ static void ggml_compute_forward_flash_attn_f32(
|
|||||||
const int iq2 = (ir - iq3*neq2*neq1)/neq1;
|
const int iq2 = (ir - iq3*neq2*neq1)/neq1;
|
||||||
const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
|
const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
|
||||||
|
|
||||||
float * S = (float *) params->wdata + ith*(M + CACHE_LINE_SIZE_F32);
|
float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32);
|
||||||
|
|
||||||
|
for (int i = M; i < Mup; ++i) {
|
||||||
|
S[i] = -INFINITY;
|
||||||
|
}
|
||||||
|
|
||||||
for (int ic = 0; ic < nek1; ++ic) {
|
for (int ic = 0; ic < nek1; ++ic) {
|
||||||
// k indices
|
// k indices
|
||||||
@ -5903,30 +5915,50 @@ static void ggml_compute_forward_flash_attn_f32(
|
|||||||
// softmax
|
// softmax
|
||||||
{
|
{
|
||||||
float max = -INFINITY;
|
float max = -INFINITY;
|
||||||
for (int i = 0; i < M; i++) {
|
ggml_vec_max_f32(M, &max, S);
|
||||||
max = MAX(max, S[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_float sum = 0.0;
|
float sum = 0.0f;
|
||||||
|
{
|
||||||
|
#ifndef GGML_USE_ACCELERATE
|
||||||
|
uint16_t scvt[GGML_SOFT_MAX_UNROLL];
|
||||||
|
ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
|
||||||
|
|
||||||
uint16_t ss;
|
for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
|
||||||
for (int i = 0; i < M; i++) {
|
float * SS = S + i;
|
||||||
if (S[i] == -INFINITY) {
|
|
||||||
S[i] = 0.0;
|
for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
|
||||||
|
if (SS[j] == -INFINITY) {
|
||||||
|
SS[j] = 0.0f;
|
||||||
} else {
|
} else {
|
||||||
//const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
|
ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
|
||||||
ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
|
memcpy(&scvt[j], &s, sizeof(uint16_t));
|
||||||
memcpy(&ss, &s, sizeof(ss));
|
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
|
||||||
const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
|
sump[j] += val;
|
||||||
sum += val;
|
SS[j] = val;
|
||||||
S[i] = val;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
|
||||||
|
sum += sump[i];
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
vvexpf(S, S, &Mup);
|
||||||
|
ggml_vec_sum_f32(Mup, &sum, S);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
assert(sum > 0.0f);
|
assert(sum > 0.0f);
|
||||||
|
|
||||||
sum = 1.0/sum;
|
sum = 1.0/sum;
|
||||||
ggml_vec_scale_f32(M, S, sum);
|
ggml_vec_scale_f32(M, S, sum);
|
||||||
|
|
||||||
|
#ifndef NDEBUG
|
||||||
|
for (int i = 0; i < M; ++i) {
|
||||||
|
assert(!isnan(S[i]));
|
||||||
|
assert(!isinf(S[i]));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int ic = 0; ic < nev1; ++ic) {
|
for (int ic = 0; ic < nev1; ++ic) {
|
||||||
@ -6001,6 +6033,8 @@ static void ggml_compute_forward_flash_attn_f16(
|
|||||||
const int P = nek1 - N;
|
const int P = nek1 - N;
|
||||||
const int M = P + N;
|
const int M = P + N;
|
||||||
|
|
||||||
|
const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
|
||||||
|
|
||||||
GGML_ASSERT(ne0 == D);
|
GGML_ASSERT(ne0 == D);
|
||||||
GGML_ASSERT(ne1 == N);
|
GGML_ASSERT(ne1 == N);
|
||||||
GGML_ASSERT(P >= 0);
|
GGML_ASSERT(P >= 0);
|
||||||
@ -6053,7 +6087,11 @@ static void ggml_compute_forward_flash_attn_f16(
|
|||||||
const int iq2 = (ir - iq3*neq2*neq1)/neq1;
|
const int iq2 = (ir - iq3*neq2*neq1)/neq1;
|
||||||
const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
|
const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
|
||||||
|
|
||||||
float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32);
|
float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32);
|
||||||
|
|
||||||
|
for (int i = M; i < Mup; ++i) {
|
||||||
|
S[i] = -INFINITY;
|
||||||
|
}
|
||||||
|
|
||||||
for (int ic = 0; ic < nek1; ++ic) {
|
for (int ic = 0; ic < nek1; ++ic) {
|
||||||
// k indices
|
// k indices
|
||||||
@ -6084,30 +6122,50 @@ static void ggml_compute_forward_flash_attn_f16(
|
|||||||
// softmax
|
// softmax
|
||||||
{
|
{
|
||||||
float max = -INFINITY;
|
float max = -INFINITY;
|
||||||
for (int i = 0; i < M; i++) {
|
ggml_vec_max_f32(M, &max, S);
|
||||||
max = MAX(max, S[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_float sum = 0.0;
|
float sum = 0.0f;
|
||||||
|
{
|
||||||
|
#ifndef GGML_USE_ACCELERATE
|
||||||
|
uint16_t scvt[GGML_SOFT_MAX_UNROLL];
|
||||||
|
ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
|
||||||
|
|
||||||
uint16_t ss;
|
for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
|
||||||
for (int i = 0; i < M; i++) {
|
float * SS = S + i;
|
||||||
if (S[i] == -INFINITY) {
|
|
||||||
S[i] = 0.0;
|
for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
|
||||||
|
if (SS[j] == -INFINITY) {
|
||||||
|
SS[j] = 0.0f;
|
||||||
} else {
|
} else {
|
||||||
//const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
|
ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
|
||||||
ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
|
memcpy(&scvt[j], &s, sizeof(uint16_t));
|
||||||
memcpy(&ss, &s, sizeof(ss));
|
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
|
||||||
const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
|
sump[j] += val;
|
||||||
sum += val;
|
SS[j] = val;
|
||||||
S[i] = val;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
|
||||||
|
sum += sump[i];
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
vvexpf(S, S, &Mup);
|
||||||
|
ggml_vec_sum_f32(Mup, &sum, S);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
assert(sum > 0.0f);
|
assert(sum > 0.0f);
|
||||||
|
|
||||||
sum = 1.0/sum;
|
sum = 1.0/sum;
|
||||||
ggml_vec_scale_f32(M, S, sum);
|
ggml_vec_scale_f32(M, S, sum);
|
||||||
|
|
||||||
|
#ifndef NDEBUG
|
||||||
|
for (int i = 0; i < M; ++i) {
|
||||||
|
assert(!isnan(S[i]));
|
||||||
|
assert(!isinf(S[i]));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
|
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
|
||||||
@ -7188,14 +7246,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|||||||
|
|
||||||
size_t cur = 0;
|
size_t cur = 0;
|
||||||
|
|
||||||
|
const int ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
|
||||||
|
|
||||||
if (node->src1->type == GGML_TYPE_F32) {
|
if (node->src1->type == GGML_TYPE_F32) {
|
||||||
cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
|
cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
|
||||||
cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
|
cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
|
||||||
}
|
}
|
||||||
|
|
||||||
if (node->src1->type == GGML_TYPE_F16) {
|
if (node->src1->type == GGML_TYPE_F16) {
|
||||||
cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
|
cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
|
||||||
cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
|
cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
|
||||||
}
|
}
|
||||||
|
|
||||||
work_size = MAX(work_size, cur);
|
work_size = MAX(work_size, cur);
|
||||||
|
@ -131,7 +131,7 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
|
|||||||
{ "su", { 98, "sundanese", } },
|
{ "su", { 98, "sundanese", } },
|
||||||
};
|
};
|
||||||
|
|
||||||
static const size_t MB = 1024*1024;
|
static const size_t MB = 3*1024*1024;
|
||||||
|
|
||||||
static const std::map<e_model, size_t> MEM_REQ_MODEL = {
|
static const std::map<e_model, size_t> MEM_REQ_MODEL = {
|
||||||
{ MODEL_TINY, 74ull*MB },
|
{ MODEL_TINY, 74ull*MB },
|
||||||
|
Loading…
Reference in New Issue
Block a user