From 705db0f728310c32bc96f4e355e2b18076932f75 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Fri, 13 Jun 2025 17:35:52 +0200 Subject: [PATCH] whisper : fix VAD processing for skipped audio segments (#3230) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit addresses an issue with token timestamps in `whisper_exp_compute_token_level_timestamps` when audio segments are skipped; the issue is related to the VAD processing and the energy levels. The motivation for this is that the token timestamps exceed the energy array bounds due to segment timing misalignment: ```console (skipped introduction) ↓ Audio segment: [2600ms → 5600ms] (3 seconds of actual audio) Energy array: [0 → 480652] (samples for 3 seconds) Token timestamps: [3266ms → 3408ms] (absolute timestamps) ``` So both `s0` and `s1` get clamped to the maximum sample index (480652), which causes the start/end timestamps to be the same for all the tokens after a certain point. This is addressed by using segment-relative timestamps in the `timestamp_to_sample` and `sample_to_timestamp` helper functions. 
--- src/whisper.cpp | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index a2f28d7d..6483ae8a 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -8325,10 +8325,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) { // token-level timestamps // -static int timestamp_to_sample(int64_t t, int n_samples) { - return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100))); -} - static int64_t sample_to_timestamp(int i_sample) { return (100ll*i_sample)/WHISPER_SAMPLE_RATE; } @@ -8378,6 +8374,18 @@ static std::vector get_signal_energy(const float * signal, int n_samples, return result; } +static int timestamp_to_sample(int64_t t, int64_t segment_t0, int n_samples) { + // Convert absolute timestamp to segment-relative timestamp + int64_t relative_t = t - segment_t0; + int sample = (int)((relative_t * WHISPER_SAMPLE_RATE) / 100); + return std::max(0, std::min(n_samples - 1, sample)); +} + +static int64_t sample_to_timestamp(int i_sample, int64_t segment_t0) { + int64_t relative_timestamp = (100ll * i_sample) / WHISPER_SAMPLE_RATE; + return relative_timestamp + segment_t0; +} + static void whisper_exp_compute_token_level_timestamps( struct whisper_context & ctx, struct whisper_state & state, @@ -8518,8 +8526,8 @@ static void whisper_exp_compute_token_level_timestamps( continue; } - int s0 = timestamp_to_sample(tokens[j].t0, n_samples); - int s1 = timestamp_to_sample(tokens[j].t1, n_samples); + int s0 = timestamp_to_sample(tokens[j].t0, segment.t0, n_samples); + int s1 = timestamp_to_sample(tokens[j].t1, segment.t0, n_samples); const int ss0 = std::max(s0 - hw, 0); const int ss1 = std::min(s1 + hw, n_samples); @@ -8540,7 +8548,7 @@ static void whisper_exp_compute_token_level_timestamps( while (k > 0 && state.energy[k] > thold) { k--; } - tokens[j].t0 = sample_to_timestamp(k); + tokens[j].t0 = sample_to_timestamp(k, segment.t0); if (tokens[j].t0 < 
tokens[j - 1].t1) { tokens[j].t0 = tokens[j - 1].t1; } else { @@ -8551,7 +8559,7 @@ static void whisper_exp_compute_token_level_timestamps( k++; } s0 = k; - tokens[j].t0 = sample_to_timestamp(k); + tokens[j].t0 = sample_to_timestamp(k, segment.t0); } } @@ -8561,7 +8569,7 @@ static void whisper_exp_compute_token_level_timestamps( while (k < n_samples - 1 && state.energy[k] > thold) { k++; } - tokens[j].t1 = sample_to_timestamp(k); + tokens[j].t1 = sample_to_timestamp(k, segment.t0); if (j < n - 1 && tokens[j].t1 > tokens[j + 1].t0) { tokens[j].t1 = tokens[j + 1].t0; } else { @@ -8572,7 +8580,7 @@ static void whisper_exp_compute_token_level_timestamps( k--; } s1 = k; - tokens[j].t1 = sample_to_timestamp(k); + tokens[j].t1 = sample_to_timestamp(k, segment.t0); } } }