whisper : improve decoding strategy (#244)

- Clear the past prompt when there is only very short audio left for processing.
  My observation is that in these cases the decoding tends to repeat and
  hallucinate stuff, and I think this is induced by the existing prompt.
- When we fail to sample a timestamp token, retry by clearing the past
  prompt. If it fails again, then we advance the window by 1 second.
This commit is contained in:
Georgi Gerganov 2022-12-16 18:31:17 +02:00
parent a82d331034
commit 6a7c82501e
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735

View File

@ -2650,10 +2650,17 @@ int whisper_full(
} }
} }
// of only 1 second left, then stop
if (seek + 100 >= seek_end) { if (seek + 100 >= seek_end) {
break; break;
} }
// if there is a very short audio segment left to process, we remove any past prompt since it tends
// to confuse the decoder and often make it repeat or hallucinate stuff
if (seek > seek_start && seek + 500 >= seek_end) {
prompt_past.clear();
}
if (params.encoder_begin_callback) { if (params.encoder_begin_callback) {
if (params.encoder_begin_callback(ctx, params.encoder_begin_callback_user_data) == false) { if (params.encoder_begin_callback(ctx, params.encoder_begin_callback_user_data) == false) {
fprintf(stderr, "%s: encoder_begin_callback returned false - aborting\n", __func__); fprintf(stderr, "%s: encoder_begin_callback returned false - aborting\n", __func__);
@ -2780,8 +2787,14 @@ int whisper_full(
} }
if (failed) { if (failed) {
fprintf(stderr, "\n%s: failed to generate timestamp token - using fallback strategy\n\n", __func__); // when we fail to sample timestamp token, retry by clearing the past prompt
seek += 100; // if it fails again, then we advance the window by 1 second
if (prompt_past.size() > 0) {
prompt_past.clear();
} else {
fprintf(stderr, "\n%s: failed to generate timestamp token - skipping one second\n\n", __func__);
seek += 100;
}
continue; continue;
} }