whisper.cpp/tests/test-vad-full.cpp
Daniel Bevenius e41bc5c61a
vad : add initial Voice Activity Detection (VAD) support (#3065)
* vad : add initial Voice Activity Detection (VAD) support

This commit add support for Voice Activity Detection (VAD). When enabled
this feature will process the audio input and detect speech segments.
This information is then used to reduce the number of samples that need
to be processed by whisper_full.

Resolves: https://github.com/ggml-org/whisper.cpp/issues/3003

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-05-12 16:10:11 +02:00

55 lines
1.8 KiB
C++

#include "whisper.h"
#include "common-whisper.h"
#include <cstdio>
#include <cfloat>
#include <string>
#include <cstring>
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <cassert>
int main() {
std::string whisper_model_path = "../../models/ggml-base.en.bin";
std::string vad_model_path = "../../models/for-tests-silero-v5.1.2-ggml.bin";
std::string sample_path = "../../samples/jfk.wav";
// Load the sample audio file
std::vector<float> pcmf32;
std::vector<std::vector<float>> pcmf32s;
assert(read_audio_data(sample_path.c_str(), pcmf32, pcmf32s, false));
struct whisper_context_params cparams = whisper_context_default_params();
struct whisper_context * wctx = whisper_init_from_file_with_params(
whisper_model_path.c_str(),
cparams);
struct whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH);
wparams.vad = true;
wparams.vad_model_path = vad_model_path.c_str();
wparams.vad_params.threshold = 0.5f;
wparams.vad_params.min_speech_duration_ms = 250;
wparams.vad_params.min_silence_duration_ms = 100;
wparams.vad_params.max_speech_duration_s = FLT_MAX;
wparams.vad_params.speech_pad_ms = 30;
assert(whisper_full_parallel(wctx, wparams, pcmf32.data(), pcmf32.size(), 1) == 0);
const int n_segments = whisper_full_n_segments(wctx);
assert(n_segments == 1);
assert(strcmp(" And so my fellow Americans, ask not what your country can do for you,"
" ask what you can do for your country.",
whisper_full_get_segment_text(wctx, 0)) == 0);
assert(whisper_full_get_segment_t0(wctx, 0) == 29);
assert(whisper_full_get_segment_t1(wctx, 0) == 1049);
whisper_free(wctx);
return 0;
}