Compare commits

2 Commits

@@ -1659,24 +1659,15 @@ static struct ggml_cgraph * whisper_build_graph_conv(
         ggml_set_name(cur, "embd_conv");
         wstate.embd_conv = cur;
     } else {
-#ifdef WHISPER_USE_COREML
-        cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
-        ggml_allocr_alloc(alloc, cur);
-
-        if (!ggml_allocr_is_measure(alloc)) {
-            whisper_coreml_encode(wstate.ctx_coreml, mel->ne[0], mel->ne[1], (float *) mel->data, (float *) cur->data);
-        }
-#endif
-#ifdef WHISPER_USE_OPENVINO
-        cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
-        ggml_allocr_alloc(alloc, cur);
-
-        if (!ggml_allocr_is_measure(alloc)) {
-            whisper_openvino_encode(wstate.ctx_openvino, mel, cur);
-        }
-#endif
+        // transform the "mel" tensor to "embd_enc" via a sequence of ggml ops
+        // these are not actually executed when using external encoder
+        // necessary only to prepare tensors with the appropriate memory sizes
+        cur = ggml_conv_1d_ph(ctx0, model.e_conv_1_w, mel, 1, 1);                     // (conv)
+        cur = ggml_conv_1d_ph(ctx0, model.e_conv_2_w, cur, 2, 1);                     // (conv)
+        cur = ggml_add(ctx0, model.e_pe, ggml_cont(ctx0, ggml_transpose(ctx0, cur))); // (cross)
         ggml_set_name(cur, "embd_enc");
+        ggml_set_output(cur);
         wstate.embd_enc = cur;
     }
@@ -1708,14 +1699,6 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
     ggml_cgraph * gf = ggml_new_graph_custom(ctx0, WHISPER_MAX_NODES, false);

-    //ggml_allocr * alloc = wstate.alloc_encode.alloc;
-
-    //struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_ctx, n_state);
-    //ggml_allocr_alloc(alloc, cur);
-
-    //if (!ggml_allocr_is_measure(alloc)) {
-    //    ggml_backend_tensor_copy(wstate.embd_conv, cur);
-    //}
     struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_conv);

     const float KQscale = 1.0f/sqrtf(float(n_state)/n_head);
@@ -1957,14 +1940,6 @@ static struct ggml_cgraph * whisper_build_graph_cross(
     ggml_cgraph * gf = ggml_new_graph(ctx0);

-    //ggml_allocr * alloc = wstate.alloc_cross.alloc;
-
-    //struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
-    //ggml_allocr_alloc(alloc, cur);
-
-    //if (!ggml_allocr_is_measure(alloc)) {
-    //    ggml_backend_tensor_copy(wstate.embd_enc, cur);
-    //}
     struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_enc);

     const float Kscale = pow(float(n_state) / n_head, -0.25);
@@ -2037,13 +2012,13 @@ static bool whisper_encode_internal(
             return false;
         }

+        struct ggml_tensor * mel = ggml_graph_get_tensor(gf, "mel");
+
         // set the input
         {
             const auto & mel_inp = wstate.mel;
             const int n_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : wctx.model.hparams.n_audio_ctx;

-            struct ggml_tensor * mel = ggml_graph_get_tensor(gf, "mel");
-
             assert(mel->type == GGML_TYPE_F32);
             assert(mel_inp.n_mel == wctx.model.hparams.n_mels);
@@ -2068,6 +2043,12 @@ static bool whisper_encode_internal(
         if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
             return false;
         }
+    } else {
+#if defined(WHISPER_USE_COREML)
+        whisper_coreml_encode(wstate.ctx_coreml, mel->ne[0], mel->ne[1], (float *) mel->data, (float *) wstate.embd_enc->data);
+#elif defined(WHISPER_USE_OPENVINO)
+        whisper_openvino_encode(wstate.ctx_openvino, mel, wstate.embd_enc);
+#endif
     }
 }
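
Taken together, the hunks above move the CoreML/OpenVINO calls out of graph construction (whisper_build_graph_conv) and into the compute step (whisper_encode_internal): the conv ops are still declared so that wstate.embd_enc is created with the right shape, but when an external encoder is compiled in, that encoder writes the result directly instead of the graph being executed. Below is a minimal, self-contained sketch of this "build always, execute conditionally" pattern; every name in it (Tensor, build_embd_enc, compute_graph, external_encode, use_external_encoder) is an illustrative stand-in, not the whisper.cpp or ggml API.

// Sketch only, not part of the diff: models the dispatch introduced by this change.
#include <cstdio>
#include <vector>

struct Tensor {
    std::vector<float> data;
    int ne0, ne1;
};

// stand-in for whisper_build_graph_conv: only prepares an output of the right size,
// nothing is executed here
static Tensor build_embd_enc(int n_state, int n_ctx) {
    return Tensor{ std::vector<float>(n_state * n_ctx, 0.0f), n_state, n_ctx };
}

// stand-in for ggml_graph_compute_helper: runs the graph and fills embd_enc
static bool compute_graph(Tensor & embd_enc) {
    (void) embd_enc; // real code would execute the ggml ops here
    return true;
}

// stand-in for whisper_coreml_encode / whisper_openvino_encode: writes into the
// pre-sized output buffer directly, bypassing the graph
static void external_encode(const Tensor & mel, Tensor & embd_enc) {
    (void) mel;
    (void) embd_enc;
}

static bool encode(const Tensor & mel, bool use_external_encoder) {
    // the output tensor is prepared in both cases, mirroring the unexecuted conv ops above
    Tensor embd_enc = build_embd_enc(/*n_state=*/384, /*n_ctx=*/1500);

    if (!use_external_encoder) {
        if (!compute_graph(embd_enc)) {
            return false;
        }
    } else {
        external_encode(mel, embd_enc);
    }
    return true;
}

int main() {
    Tensor mel{ std::vector<float>(80 * 3000, 0.0f), /*ne0=*/3000, /*ne1=*/80 };
    printf("encode ok: %d\n", encode(mel, /*use_external_encoder=*/false) ? 1 : 0);
    return 0;
}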