whisper : allow whisper_full from mel spectrogram - no audio (#1214)

Co-authored-by: jbrough <jamie1612@gmail.com>
This commit is contained in:
Georgi Gerganov 2023-08-27 20:02:57 +03:00 committed by GitHub
parent 7e54df414e
commit b5bb5c85d4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -3140,7 +3140,6 @@ int whisper_decode(struct whisper_context * ctx, const whisper_token * tokens, i
return false; return false;
} }
if (!whisper_decode_internal(*ctx, *ctx->state, ctx->state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) { if (!whisper_decode_internal(*ctx, *ctx->state, ctx->state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
log("%s: failed to eval\n", __func__); log("%s: failed to eval\n", __func__);
return 1; return 1;
@ -3374,7 +3373,6 @@ float * whisper_get_logits(struct whisper_context * ctx) {
return ctx->state->logits.data(); return ctx->state->logits.data();
} }
float * whisper_get_logits_from_state(struct whisper_state * state) { float * whisper_get_logits_from_state(struct whisper_state * state) {
return state->logits.data(); return state->logits.data();
} }
@ -4087,6 +4085,7 @@ int whisper_full_with_state(
result_all.clear(); result_all.clear();
if (n_samples > 0) {
// compute log mel spectrogram // compute log mel spectrogram
if (params.speed_up) { if (params.speed_up) {
// TODO: Replace PV with more advanced algorithm // TODO: Replace PV with more advanced algorithm
@ -4098,6 +4097,7 @@ int whisper_full_with_state(
return -2; return -2;
} }
} }
}
// auto-detect language if not specified // auto-detect language if not specified
if (params.language == nullptr || strlen(params.language) == 0 || strcmp(params.language, "auto") == 0 || params.detect_language) { if (params.language == nullptr || strlen(params.language) == 0 || strcmp(params.language, "auto") == 0 || params.detect_language) {
@ -4121,8 +4121,10 @@ int whisper_full_with_state(
state->t_beg = 0; state->t_beg = 0;
state->t_last = 0; state->t_last = 0;
state->tid_last = 0; state->tid_last = 0;
if (n_samples > 0) {
state->energy = get_signal_energy(samples, n_samples, 32); state->energy = get_signal_energy(samples, n_samples, 32);
} }
}
const int seek_start = params.offset_ms/10; const int seek_start = params.offset_ms/10;
const int seek_end = params.duration_ms == 0 ? whisper_n_len_from_state(state) : seek_start + params.duration_ms/10; const int seek_end = params.duration_ms == 0 ? whisper_n_len_from_state(state) : seek_start + params.duration_ms/10;
@ -4813,7 +4815,6 @@ int whisper_full_with_state(
return 0; return 0;
} }
int whisper_full( int whisper_full(
struct whisper_context * ctx, struct whisper_context * ctx,
struct whisper_full_params params, struct whisper_full_params params,
@ -4890,7 +4891,6 @@ int whisper_full_parallel(
result.t0 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t; result.t0 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;
result.t1 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t; result.t1 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;
// make sure that segments are not overlapping // make sure that segments are not overlapping
if (!ctx->state->result_all.empty()) { if (!ctx->state->result_all.empty()) {
result.t0 = std::max(result.t0, ctx->state->result_all.back().t1); result.t0 = std::max(result.t0, ctx->state->result_all.back().t1);