whisper.cpp/tests/test-vad.cpp
Daniel Bevenius e41bc5c61a
vad : add initial Voice Activity Detection (VAD) support (#3065)
* vad : add initial Voice Activity Detection (VAD) support

This commit adds support for Voice Activity Detection (VAD). When enabled,
this feature will process the audio input and detect speech segments.
This information is then used to reduce the number of samples that need
to be processed by whisper_full.

Resolves: https://github.com/ggml-org/whisper.cpp/issues/3003

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-05-12 16:10:11 +02:00

84 lines
2.7 KiB
C++

#include "whisper.h"
#include "common-whisper.h"
#include <cstdio>
#include <string>
#include <vector>
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <cassert>
// Checks that a VAD parameter struct carries the default values this test
// expects. Aborts via assert on the first field that differs.
void assert_default_params(const struct whisper_vad_params & params) {
    // Expected defaults, kept in the same literal types the comparisons
    // have always used (double for the threshold, float for the overlap).
    constexpr double expected_threshold          = 0.5;
    constexpr int    expected_min_speech_ms      = 250;
    constexpr int    expected_min_silence_ms     = 100;
    constexpr float  expected_samples_overlap    = 0.1f;

    assert(params.threshold == expected_threshold);
    assert(params.min_speech_duration_ms == expected_min_speech_ms);
    assert(params.min_silence_duration_ms == expected_min_silence_ms);
    assert(params.samples_overlap == expected_samples_overlap);
}
// Checks that a VAD context parameter struct carries the default values this
// test expects: four threads, GPU disabled, GPU device index zero.
void assert_default_context_params(const struct whisper_vad_context_params & params) {
    constexpr int expected_threads    = 4;
    constexpr int expected_gpu_device = 0;

    assert(params.n_threads == expected_threads);
    assert(!params.use_gpu);
    assert(params.gpu_device == expected_gpu_device);
}
// Runs speech detection over the given samples and checks the results that
// the context exposes afterwards.
//
//   vctx      - VAD context to run detection with
//   params    - accepted for symmetry with the other helpers; not used here
//   pcmf32    - pointer to the audio samples
//   n_samples - number of samples available at pcmf32
void test_detect_speech(
        struct whisper_vad_context * vctx,
        struct whisper_vad_params params,
        const float * pcmf32,
        int n_samples) {
    // Detection itself has to report success.
    const bool detected = whisper_vad_detect_speech(vctx, pcmf32, n_samples);
    assert(detected);

    // 344 is the probability count expected for the sample that main() passes in.
    const auto n_probs = whisper_vad_n_probs(vctx);
    assert(n_probs == 344);

    // The probability buffer must be available once detection has run.
    const auto probs = whisper_vad_probs(vctx);
    assert(probs != nullptr);
}
// Builds speech segments from the probabilities already held by the context,
// checks the expected segment count and prints each segment's start and end.
//
//   vctx   - VAD context on which speech detection has already been run
//   params - VAD parameters forwarded to whisper_vad_segments_from_probs
//
// Returns the segments object; the caller releases it (main() does so with
// whisper_vad_free_segments).
struct whisper_vad_segments * test_detect_timestamps(
        struct whisper_vad_context * vctx,
        struct whisper_vad_params params) {
    struct whisper_vad_segments * timestamps = whisper_vad_segments_from_probs(vctx, params);

    // Fail with a clear assertion instead of handing a null pointer to the
    // accessor functions below.
    assert(timestamps != nullptr);

    // The count does not change while printing, so query it once.
    const int n_segments = whisper_vad_segments_n_segments(timestamps);
    assert(n_segments == 5);

    for (int i = 0; i < n_segments; ++i) {
        printf("VAD segment %d: start = %.2f, end = %.2f\n", i,
               whisper_vad_segments_get_segment_t0(timestamps, i),
               whisper_vad_segments_get_segment_t1(timestamps, i));
    }

    return timestamps;
}
// Test driver: loads the sample audio and the VAD model, verifies the default
// parameter structs, then checks speech detection and segment extraction.
int main() {
    // Fixture locations (relative paths, resolved against the working directory).
    const std::string model_file = "../../models/for-tests-silero-v5.1.2-ggml.bin";
    const std::string audio_file = "../../samples/jfk.wav";

    // Load the sample audio file. The second buffer is expected to stay empty
    // because the last argument is false.
    std::vector<float> mono;
    std::vector<std::vector<float>> stereo;
    const bool audio_loaded = read_audio_data(audio_file.c_str(), mono, stereo, false);
    assert(audio_loaded);
    assert(!mono.empty());
    assert(stereo.empty()); // no stereo vector

    // Load the VAD model using the (verified) default context parameters.
    struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
    assert_default_context_params(ctx_params);

    struct whisper_vad_context * vctx =
        whisper_vad_init_from_file_with_params(model_file.c_str(), ctx_params);
    assert(vctx != nullptr);

    struct whisper_vad_params params = whisper_vad_default_params();
    assert_default_params(params);

    // Test speech probabilities
    test_detect_speech(vctx, params, mono.data(), static_cast<int>(mono.size()));

    // Test speech timestamps (uses speech probabilities from above)
    struct whisper_vad_segments * segments = test_detect_timestamps(vctx, params);

    // Release the segments first, then the context they were derived from.
    whisper_vad_free_segments(segments);
    whisper_vad_free(vctx);

    return 0;
}