whisper : fix VAD processing for skipped audio segments (#3230)

This commit fixes incorrect token timestamps produced by
`whisper_exp_compute_token_level_timestamps` when audio segments are
skipped. The issue is related to the VAD processing and the energy levels.

The motivation for this is that the token timestamps exceed the energy
array bounds due to segment timing misalignment:
```console
                  (skipped introduction)
                    ↓
Audio segment:     [2600ms → 5600ms]  (3 seconds of actual audio)
Energy array:      [0 → 480652]       (samples for 3 seconds)
Token timestamps:  [3266ms → 3408ms]  (absolute timestamps)
```
So both `s0` and `s1` get clamped to the maximum sample index (480652),
which causes the start and end timestamps to be the same for all tokens
after a certain point.

This is addressed by using segment-relative timestamps in the
`timestamp_to_sample` and `sample_to_timestamp` functions.
This commit is contained in:
Daniel Bevenius 2025-06-13 17:35:52 +02:00 committed by GitHub
parent 0a4d85cf8a
commit 705db0f728
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -8325,10 +8325,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
// token-level timestamps
//
// Map a timestamp to a sample index: t is scaled by WHISPER_SAMPLE_RATE/100
// and the result is clamped to the range [0, n_samples - 1].
static int timestamp_to_sample(int64_t t, int n_samples) {
    const int last_index = (int) n_samples - 1;
    const int raw_index  = (int) ((t*WHISPER_SAMPLE_RATE)/100);

    // apply the upper bound first, then the lower bound
    const int bounded_above = std::min(last_index, raw_index);
    return std::max(0, bounded_above);
}
// Map a sample index back to a timestamp: i_sample * 100 / WHISPER_SAMPLE_RATE.
// The multiplication is carried out in long long before the division.
static int64_t sample_to_timestamp(int i_sample) {
    const auto scaled = 100ll*i_sample;
    return scaled/WHISPER_SAMPLE_RATE;
}
@ -8378,6 +8374,18 @@ static std::vector<float> get_signal_energy(const float * signal, int n_samples,
return result;
}
// Map an absolute timestamp to an index into the samples of one segment.
//
//   t          - absolute timestamp
//   segment_t0 - timestamp at which the segment starts (same units as t)
//   n_samples  - number of samples available for the segment
//
// Returns the sample index clamped to [0, n_samples - 1]; returns 0 when
// n_samples <= 0. The timestamp difference is scaled by WHISPER_SAMPLE_RATE/100.
static int timestamp_to_sample(int64_t t, int64_t segment_t0, int n_samples) {
    // Convert absolute timestamp to segment-relative timestamp
    const int64_t relative_t = t - segment_t0;

    // Keep the index in 64 bits until after clamping: narrowing to int first
    // could wrap an out-of-range value and clamp it to the wrong end.
    const int64_t sample = (relative_t * WHISPER_SAMPLE_RATE) / 100;
    const int64_t last   = (int64_t) n_samples - 1;

    // The clamped value lies within [0, max(0, n_samples - 1)], so it fits in int.
    return (int) std::max<int64_t>(0, std::min<int64_t>(last, sample));
}
// Map a segment-local sample index back to an absolute timestamp:
// the index is scaled by 100/WHISPER_SAMPLE_RATE (integer division)
// and then offset by the segment start segment_t0.
static int64_t sample_to_timestamp(int i_sample, int64_t segment_t0) {
    return segment_t0 + (100ll * i_sample) / WHISPER_SAMPLE_RATE;
}
static void whisper_exp_compute_token_level_timestamps(
struct whisper_context & ctx,
struct whisper_state & state,
@ -8518,8 +8526,8 @@ static void whisper_exp_compute_token_level_timestamps(
continue;
}
int s0 = timestamp_to_sample(tokens[j].t0, n_samples);
int s1 = timestamp_to_sample(tokens[j].t1, n_samples);
int s0 = timestamp_to_sample(tokens[j].t0, segment.t0, n_samples);
int s1 = timestamp_to_sample(tokens[j].t1, segment.t0, n_samples);
const int ss0 = std::max(s0 - hw, 0);
const int ss1 = std::min(s1 + hw, n_samples);
@ -8540,7 +8548,7 @@ static void whisper_exp_compute_token_level_timestamps(
while (k > 0 && state.energy[k] > thold) {
k--;
}
tokens[j].t0 = sample_to_timestamp(k);
tokens[j].t0 = sample_to_timestamp(k, segment.t0);
if (tokens[j].t0 < tokens[j - 1].t1) {
tokens[j].t0 = tokens[j - 1].t1;
} else {
@ -8551,7 +8559,7 @@ static void whisper_exp_compute_token_level_timestamps(
k++;
}
s0 = k;
tokens[j].t0 = sample_to_timestamp(k);
tokens[j].t0 = sample_to_timestamp(k, segment.t0);
}
}
@ -8561,7 +8569,7 @@ static void whisper_exp_compute_token_level_timestamps(
while (k < n_samples - 1 && state.energy[k] > thold) {
k++;
}
tokens[j].t1 = sample_to_timestamp(k);
tokens[j].t1 = sample_to_timestamp(k, segment.t0);
if (j < n - 1 && tokens[j].t1 > tokens[j + 1].t0) {
tokens[j].t1 = tokens[j + 1].t0;
} else {
@ -8572,7 +8580,7 @@ static void whisper_exp_compute_token_level_timestamps(
k--;
}
s1 = k;
tokens[j].t1 = sample_to_timestamp(k);
tokens[j].t1 = sample_to_timestamp(k, segment.t0);
}
}
}