mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-08-09 05:15:09 +02:00
feat: support vad for addon.node (#3301)
Co-authored-by: linxiaodong <calm.lin@wukongsch.com>
This commit is contained in:
@ -1,8 +1,10 @@
|
||||
# addon
|
||||
# whisper.cpp Node.js addon
|
||||
|
||||
This is an addon demo that can **run whisper model inference in `node` and `electron` environments**, based on [cmake-js](https://github.com/cmake-js/cmake-js).
|
||||
It can be used as a reference for using the whisper.cpp project in other node projects.
|
||||
|
||||
This addon now supports **Voice Activity Detection (VAD)** for improved transcription performance.
|
||||
|
||||
## Install
|
||||
|
||||
```shell
|
||||
@ -26,12 +28,88 @@ For Electron addon and cmake-js options, you can see [cmake-js](https://github.c
|
||||
|
||||
## Run
|
||||
|
||||
### Basic Usage
|
||||
|
||||
```shell
|
||||
cd examples/addon.node
|
||||
|
||||
node index.js --language='language' --model='model-path' --fname_inp='file-path'
|
||||
```
|
||||
|
||||
Because this is a simple Demo, only the above parameters are set in the node environment.
|
||||
### VAD (Voice Activity Detection) Usage
|
||||
|
||||
Other parameters can also be specified in the node environment.
|
||||
Run the VAD example with performance comparison:
|
||||
|
||||
```shell
|
||||
node vad-example.js
|
||||
```
|
||||
|
||||
## Voice Activity Detection (VAD) Support
|
||||
|
||||
VAD can significantly improve transcription performance by only processing speech segments, which is especially beneficial for audio files with long periods of silence.
|
||||
|
||||
### VAD Model Setup
|
||||
|
||||
Before using VAD, download a VAD model:
|
||||
|
||||
```shell
|
||||
# From the whisper.cpp root directory
|
||||
./models/download-vad-model.sh silero-v5.1.2
|
||||
```
|
||||
|
||||
### VAD Parameters
|
||||
|
||||
All VAD parameters are optional and have sensible defaults, except `vad_model`, which is required when `vad` is enabled:
|
||||
|
||||
- `vad`: Enable VAD (default: false)
|
||||
- `vad_model`: Path to VAD model file (required when VAD enabled)
|
||||
- `vad_threshold`: Speech detection threshold 0.0-1.0 (default: 0.5)
|
||||
- `vad_min_speech_duration_ms`: Min speech duration in ms (default: 250)
|
||||
- `vad_min_silence_duration_ms`: Min silence duration in ms (default: 100)
|
||||
- `vad_max_speech_duration_s`: Max speech duration in seconds (default: `FLT_MAX`, the largest finite C `float`, i.e. effectively no limit)
|
||||
- `vad_speech_pad_ms`: Speech padding in ms (default: 30)
|
||||
- `vad_samples_overlap`: Sample overlap 0.0-1.0 (default: 0.1)
|
||||
|
||||
### JavaScript API Example
|
||||
|
||||
```javascript
|
||||
const path = require("path");
|
||||
const { whisper } = require(path.join(__dirname, "../../build/Release/addon.node"));
|
||||
const { promisify } = require("util");
|
||||
|
||||
const whisperAsync = promisify(whisper);
|
||||
|
||||
// With VAD enabled
|
||||
const vadParams = {
|
||||
language: "en",
|
||||
model: path.join(__dirname, "../../models/ggml-base.en.bin"),
|
||||
fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
|
||||
vad: true,
|
||||
vad_model: path.join(__dirname, "../../models/ggml-silero-v5.1.2.bin"),
|
||||
vad_threshold: 0.5,
|
||||
progress_callback: (progress) => console.log(`Progress: ${progress}%`)
|
||||
};
|
||||
|
||||
whisperAsync(vadParams).then(result => console.log(result));
|
||||
```
|
||||
|
||||
## Supported Parameters
|
||||
|
||||
Both traditional whisper.cpp parameters and new VAD parameters are supported:
|
||||
|
||||
- `language`: Language code (e.g., "en", "es", "fr")
|
||||
- `model`: Path to whisper model file
|
||||
- `fname_inp`: Path to input audio file
|
||||
- `use_gpu`: Enable GPU acceleration (default: true)
|
||||
- `flash_attn`: Enable flash attention (default: false)
|
||||
- `no_prints`: Disable console output (default: false)
|
||||
- `no_timestamps`: Disable timestamps (default: false)
|
||||
- `detect_language`: Auto-detect language (default: false)
|
||||
- `audio_ctx`: Audio context size (default: 0)
|
||||
- `max_len`: Maximum segment length (default: 0)
|
||||
- `max_context`: Maximum context size (default: -1)
|
||||
- `prompt`: Initial prompt for decoder
|
||||
- `comma_in_time`: Use comma in timestamps (default: true)
|
||||
- `print_progress`: Print progress info (default: false)
|
||||
- `progress_callback`: Progress callback function
|
||||
- VAD parameters (see above section)
|
||||
|
@ -1,39 +1,133 @@
|
||||
const path = require("path");
|
||||
const { whisper } = require(path.join(
|
||||
__dirname,
|
||||
"../../../build/Release/addon.node"
|
||||
));
|
||||
const { promisify } = require("util");
|
||||
const { join } = require('path');
|
||||
const { whisper } = require('../../../build/Release/addon.node');
|
||||
const { promisify } = require('util');
|
||||
|
||||
const whisperAsync = promisify(whisper);
|
||||
|
||||
const whisperParamsMock = {
|
||||
language: "en",
|
||||
model: path.join(__dirname, "../../../models/ggml-base.en.bin"),
|
||||
fname_inp: path.join(__dirname, "../../../samples/jfk.wav"),
|
||||
const commonParams = {
|
||||
language: 'en',
|
||||
model: join(__dirname, '../../../models/ggml-base.en.bin'),
|
||||
fname_inp: join(__dirname, '../../../samples/jfk.wav'),
|
||||
use_gpu: true,
|
||||
flash_attn: false,
|
||||
no_prints: true,
|
||||
comma_in_time: false,
|
||||
translate: true,
|
||||
no_timestamps: false,
|
||||
detect_language: false,
|
||||
audio_ctx: 0,
|
||||
max_len: 0,
|
||||
prompt: "",
|
||||
print_progress: false,
|
||||
progress_callback: (progress) => {
|
||||
console.log(`Progress: ${progress}`);
|
||||
},
|
||||
max_context: -1
|
||||
max_len: 0
|
||||
};
|
||||
|
||||
describe("Run whisper.node", () => {
|
||||
test("it should receive a non-empty value", async () => {
|
||||
let result = await whisperAsync(whisperParamsMock);
|
||||
console.log(result);
|
||||
describe('Whisper.cpp Node.js addon with VAD support', () => {
|
||||
test('Basic whisper transcription without VAD', async () => {
|
||||
const params = {
|
||||
...commonParams,
|
||||
vad: false
|
||||
};
|
||||
|
||||
expect(result['transcription'].length).toBeGreaterThan(0);
|
||||
}, 10000);
|
||||
const result = await whisperAsync(params);
|
||||
|
||||
expect(typeof result).toBe('object');
|
||||
expect(Array.isArray(result.transcription)).toBe(true);
|
||||
expect(result.transcription.length).toBeGreaterThan(0);
|
||||
|
||||
// Check that we got some transcription text
|
||||
const text = result.transcription.map(segment => segment[2]).join(' ');
|
||||
expect(text.length).toBeGreaterThan(0);
|
||||
expect(text.toLowerCase()).toContain('ask not');
|
||||
}, 30000);
|
||||
|
||||
test('VAD parameters validation', async () => {
|
||||
// Test with invalid VAD model - should return empty transcription
|
||||
const invalidParams = {
|
||||
...commonParams,
|
||||
vad: true,
|
||||
vad_model: 'non-existent-model.bin',
|
||||
vad_threshold: 0.5
|
||||
};
|
||||
|
||||
// This should handle the error gracefully and return empty transcription
|
||||
const result = await whisperAsync(invalidParams);
|
||||
expect(typeof result).toBe('object');
|
||||
expect(Array.isArray(result.transcription)).toBe(true);
|
||||
// When VAD model doesn't exist, it should return empty transcription
|
||||
expect(result.transcription.length).toBe(0);
|
||||
}, 10000);
|
||||
|
||||
test('VAD parameter parsing', async () => {
|
||||
// Test that VAD parameters are properly parsed (even if VAD model doesn't exist)
|
||||
const vadParams = {
|
||||
...commonParams,
|
||||
vad: false, // Disabled so no model required
|
||||
vad_threshold: 0.7,
|
||||
vad_min_speech_duration_ms: 300,
|
||||
vad_min_silence_duration_ms: 150,
|
||||
vad_max_speech_duration_s: 45.0,
|
||||
vad_speech_pad_ms: 50,
|
||||
vad_samples_overlap: 0.15
|
||||
};
|
||||
|
||||
const result = await whisperAsync(vadParams);
|
||||
|
||||
expect(typeof result).toBe('object');
|
||||
expect(Array.isArray(result.transcription)).toBe(true);
|
||||
}, 30000);
|
||||
|
||||
test('Progress callback with VAD disabled', async () => {
|
||||
let progressCalled = false;
|
||||
let lastProgress = 0;
|
||||
|
||||
const params = {
|
||||
...commonParams,
|
||||
vad: false,
|
||||
progress_callback: (progress) => {
|
||||
progressCalled = true;
|
||||
lastProgress = progress;
|
||||
expect(progress).toBeGreaterThanOrEqual(0);
|
||||
expect(progress).toBeLessThanOrEqual(100);
|
||||
}
|
||||
};
|
||||
|
||||
const result = await whisperAsync(params);
|
||||
|
||||
expect(progressCalled).toBe(true);
|
||||
expect(lastProgress).toBe(100);
|
||||
expect(typeof result).toBe('object');
|
||||
}, 30000);
|
||||
|
||||
test('Language detection without VAD', async () => {
|
||||
const params = {
|
||||
...commonParams,
|
||||
vad: false,
|
||||
detect_language: true,
|
||||
language: 'auto'
|
||||
};
|
||||
|
||||
const result = await whisperAsync(params);
|
||||
|
||||
expect(typeof result).toBe('object');
|
||||
expect(typeof result.language).toBe('string');
|
||||
expect(result.language.length).toBeGreaterThan(0);
|
||||
}, 30000);
|
||||
|
||||
test('Basic transcription with all VAD parameters set', async () => {
|
||||
// Test with VAD disabled but all parameters set to ensure no crashes
|
||||
const params = {
|
||||
...commonParams,
|
||||
vad: false, // Disabled so it works without VAD model
|
||||
vad_model: '', // Empty model path
|
||||
vad_threshold: 0.6,
|
||||
vad_min_speech_duration_ms: 200,
|
||||
vad_min_silence_duration_ms: 80,
|
||||
vad_max_speech_duration_s: 25.0,
|
||||
vad_speech_pad_ms: 40,
|
||||
vad_samples_overlap: 0.08
|
||||
};
|
||||
|
||||
const result = await whisperAsync(params);
|
||||
|
||||
expect(typeof result).toBe('object');
|
||||
expect(Array.isArray(result.transcription)).toBe(true);
|
||||
expect(result.transcription.length).toBeGreaterThan(0);
|
||||
}, 30000);
|
||||
});
|
||||
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cfloat>
|
||||
|
||||
struct whisper_params {
|
||||
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||
@ -51,6 +52,16 @@ struct whisper_params {
|
||||
std::vector<std::string> fname_out = {};
|
||||
|
||||
std::vector<float> pcmf32 = {}; // mono-channel F32 PCM
|
||||
|
||||
// Voice Activity Detection (VAD) parameters
|
||||
bool vad = false;
|
||||
std::string vad_model = "";
|
||||
float vad_threshold = 0.5f;
|
||||
int vad_min_speech_duration_ms = 250;
|
||||
int vad_min_silence_duration_ms = 100;
|
||||
float vad_max_speech_duration_s = FLT_MAX;
|
||||
int vad_speech_pad_ms = 30;
|
||||
float vad_samples_overlap = 0.1f;
|
||||
};
|
||||
|
||||
struct whisper_print_user_data {
|
||||
@ -333,16 +344,16 @@ class ProgressWorker : public Napi::AsyncWorker {
|
||||
};
|
||||
wparams.progress_callback_user_data = this;
|
||||
|
||||
// Abort mechanism example
|
||||
{
|
||||
static bool is_aborted = false; // Note: this should be atomic to avoid data races
|
||||
// Set VAD parameters
|
||||
wparams.vad = params.vad;
|
||||
wparams.vad_model_path = params.vad_model.c_str();
|
||||
|
||||
wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
|
||||
bool is_aborted = *(bool*)user_data;
|
||||
return !is_aborted;
|
||||
};
|
||||
wparams.encoder_begin_callback_user_data = &is_aborted;
|
||||
}
|
||||
wparams.vad_params.threshold = params.vad_threshold;
|
||||
wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms;
|
||||
wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
|
||||
wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s;
|
||||
wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms;
|
||||
wparams.vad_params.samples_overlap = params.vad_samples_overlap;
|
||||
|
||||
if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
|
||||
fprintf(stderr, "failed to process audio\n");
|
||||
@ -385,14 +396,46 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
|
||||
std::string language = whisper_params.Get("language").As<Napi::String>();
|
||||
std::string model = whisper_params.Get("model").As<Napi::String>();
|
||||
std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
|
||||
bool use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
|
||||
bool flash_attn = whisper_params.Get("flash_attn").As<Napi::Boolean>();
|
||||
bool no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
|
||||
bool no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
|
||||
bool detect_language = whisper_params.Get("detect_language").As<Napi::Boolean>();
|
||||
int32_t audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
|
||||
bool comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
|
||||
int32_t max_len = whisper_params.Get("max_len").As<Napi::Number>();
|
||||
|
||||
bool use_gpu = true;
|
||||
if (whisper_params.Has("use_gpu") && whisper_params.Get("use_gpu").IsBoolean()) {
|
||||
use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
|
||||
}
|
||||
|
||||
bool flash_attn = false;
|
||||
if (whisper_params.Has("flash_attn") && whisper_params.Get("flash_attn").IsBoolean()) {
|
||||
flash_attn = whisper_params.Get("flash_attn").As<Napi::Boolean>();
|
||||
}
|
||||
|
||||
bool no_prints = false;
|
||||
if (whisper_params.Has("no_prints") && whisper_params.Get("no_prints").IsBoolean()) {
|
||||
no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
|
||||
}
|
||||
|
||||
bool no_timestamps = false;
|
||||
if (whisper_params.Has("no_timestamps") && whisper_params.Get("no_timestamps").IsBoolean()) {
|
||||
no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
|
||||
}
|
||||
|
||||
bool detect_language = false;
|
||||
if (whisper_params.Has("detect_language") && whisper_params.Get("detect_language").IsBoolean()) {
|
||||
detect_language = whisper_params.Get("detect_language").As<Napi::Boolean>();
|
||||
}
|
||||
|
||||
int32_t audio_ctx = 0;
|
||||
if (whisper_params.Has("audio_ctx") && whisper_params.Get("audio_ctx").IsNumber()) {
|
||||
audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
|
||||
}
|
||||
|
||||
bool comma_in_time = true;
|
||||
if (whisper_params.Has("comma_in_time") && whisper_params.Get("comma_in_time").IsBoolean()) {
|
||||
comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
|
||||
}
|
||||
|
||||
int32_t max_len = 0;
|
||||
if (whisper_params.Has("max_len") && whisper_params.Get("max_len").IsNumber()) {
|
||||
max_len = whisper_params.Get("max_len").As<Napi::Number>();
|
||||
}
|
||||
|
||||
// Add support for max_context
|
||||
int32_t max_context = -1;
|
||||
@ -408,7 +451,7 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
|
||||
|
||||
// Add support for print_progress
|
||||
bool print_progress = false;
|
||||
if (whisper_params.Has("print_progress")) {
|
||||
if (whisper_params.Has("print_progress") && whisper_params.Get("print_progress").IsBoolean()) {
|
||||
print_progress = whisper_params.Get("print_progress").As<Napi::Boolean>();
|
||||
}
|
||||
// Add support for progress_callback
|
||||
@ -417,6 +460,47 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
|
||||
progress_callback = whisper_params.Get("progress_callback").As<Napi::Function>();
|
||||
}
|
||||
|
||||
// Add support for VAD parameters
|
||||
bool vad = false;
|
||||
if (whisper_params.Has("vad") && whisper_params.Get("vad").IsBoolean()) {
|
||||
vad = whisper_params.Get("vad").As<Napi::Boolean>();
|
||||
}
|
||||
|
||||
std::string vad_model = "";
|
||||
if (whisper_params.Has("vad_model") && whisper_params.Get("vad_model").IsString()) {
|
||||
vad_model = whisper_params.Get("vad_model").As<Napi::String>();
|
||||
}
|
||||
|
||||
float vad_threshold = 0.5f;
|
||||
if (whisper_params.Has("vad_threshold") && whisper_params.Get("vad_threshold").IsNumber()) {
|
||||
vad_threshold = whisper_params.Get("vad_threshold").As<Napi::Number>();
|
||||
}
|
||||
|
||||
int vad_min_speech_duration_ms = 250;
|
||||
if (whisper_params.Has("vad_min_speech_duration_ms") && whisper_params.Get("vad_min_speech_duration_ms").IsNumber()) {
|
||||
vad_min_speech_duration_ms = whisper_params.Get("vad_min_speech_duration_ms").As<Napi::Number>();
|
||||
}
|
||||
|
||||
int vad_min_silence_duration_ms = 100;
|
||||
if (whisper_params.Has("vad_min_silence_duration_ms") && whisper_params.Get("vad_min_silence_duration_ms").IsNumber()) {
|
||||
vad_min_silence_duration_ms = whisper_params.Get("vad_min_silence_duration_ms").As<Napi::Number>();
|
||||
}
|
||||
|
||||
float vad_max_speech_duration_s = FLT_MAX;
|
||||
if (whisper_params.Has("vad_max_speech_duration_s") && whisper_params.Get("vad_max_speech_duration_s").IsNumber()) {
|
||||
vad_max_speech_duration_s = whisper_params.Get("vad_max_speech_duration_s").As<Napi::Number>();
|
||||
}
|
||||
|
||||
int vad_speech_pad_ms = 30;
|
||||
if (whisper_params.Has("vad_speech_pad_ms") && whisper_params.Get("vad_speech_pad_ms").IsNumber()) {
|
||||
vad_speech_pad_ms = whisper_params.Get("vad_speech_pad_ms").As<Napi::Number>();
|
||||
}
|
||||
|
||||
float vad_samples_overlap = 0.1f;
|
||||
if (whisper_params.Has("vad_samples_overlap") && whisper_params.Get("vad_samples_overlap").IsNumber()) {
|
||||
vad_samples_overlap = whisper_params.Get("vad_samples_overlap").As<Napi::Number>();
|
||||
}
|
||||
|
||||
Napi::Value pcmf32Value = whisper_params.Get("pcmf32");
|
||||
std::vector<float> pcmf32_vec;
|
||||
if (pcmf32Value.IsTypedArray()) {
|
||||
@ -444,6 +528,16 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
|
||||
params.prompt = prompt;
|
||||
params.detect_language = detect_language;
|
||||
|
||||
// Set VAD parameters
|
||||
params.vad = vad;
|
||||
params.vad_model = vad_model;
|
||||
params.vad_threshold = vad_threshold;
|
||||
params.vad_min_speech_duration_ms = vad_min_speech_duration_ms;
|
||||
params.vad_min_silence_duration_ms = vad_min_silence_duration_ms;
|
||||
params.vad_max_speech_duration_s = vad_max_speech_duration_s;
|
||||
params.vad_speech_pad_ms = vad_speech_pad_ms;
|
||||
params.vad_samples_overlap = vad_samples_overlap;
|
||||
|
||||
Napi::Function callback = info[1].As<Napi::Function>();
|
||||
// Create a new Worker class with progress callback support
|
||||
ProgressWorker* worker = new ProgressWorker(callback, params, progress_callback, env);
|
||||
|
132
examples/addon.node/vad-example.js
Normal file
132
examples/addon.node/vad-example.js
Normal file
@ -0,0 +1,132 @@
|
||||
const path = require("path");
|
||||
const { whisper } = require(path.join(
|
||||
__dirname,
|
||||
"../../build/Release/addon.node"
|
||||
));
|
||||
const { promisify } = require("util");
|
||||
|
||||
const whisperAsync = promisify(whisper);
|
||||
|
||||
// Settings common to both runs, so the two transcriptions differ only in
// their VAD options and progress label.
const sharedParams = {
  language: "en",
  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
  fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
  use_gpu: true,
  flash_attn: false,
  no_prints: false,
  comma_in_time: true,
  translate: false,
  no_timestamps: false,
  detect_language: false,
  audio_ctx: 0,
  max_len: 0,
};

// Parameters for a transcription run with Voice Activity Detection turned on.
const vadParams = {
  ...sharedParams,
  // VAD-specific options
  vad: true,
  // This model file is not bundled; download it first
  // (e.g. ./models/download-vad-model.sh silero-v5.1.2).
  vad_model: path.join(__dirname, "../../models/ggml-silero-v5.1.2.bin"),
  vad_threshold: 0.5,
  vad_min_speech_duration_ms: 250,
  vad_min_silence_duration_ms: 100,
  vad_max_speech_duration_s: 30.0,
  vad_speech_pad_ms: 30,
  vad_samples_overlap: 0.1,
  progress_callback: (progress) => {
    console.log(`VAD Transcription progress: ${progress}%`);
  },
};

// Parameters for the baseline run: same audio and model, VAD explicitly off.
const traditionalParams = {
  ...sharedParams,
  vad: false,
  progress_callback: (progress) => {
    console.log(`Traditional transcription progress: ${progress}%`);
  },
};
|
||||
|
||||
/**
 * Runs the VAD demo.
 *
 * If the file at `vadParams.vad_model` does not exist, prints download
 * instructions and performs a single transcription with `traditionalParams`.
 * Otherwise transcribes once with `vadParams` and once with
 * `traditionalParams`, measuring each run with `Date.now()`, and prints both
 * results followed by a timing comparison.
 *
 * Errors raised inside the run are caught and reported with `console.error`
 * instead of being rethrown.
 *
 * @returns {Promise<void>}
 */
async function runVADExample() {
  // Runs one transcription and reports its wall-clock duration in milliseconds.
  const timeRun = async (params) => {
    const begin = Date.now();
    const output = await whisperAsync(params);
    return { output, elapsedMs: Date.now() - begin };
  };

  try {
    console.log("=== Whisper.cpp Node.js VAD Example ===\n");

    // Without the VAD model on disk, fall back to a plain transcription.
    const { existsSync } = require('fs');
    const hasVadModel = existsSync(vadParams.vad_model);
    if (!hasVadModel) {
      const notice = [
        "⚠️ VAD model not found. Please download the VAD model first:",
        " ./models/download-vad-model.sh silero-v5.1.2",
        " Or run: python models/convert-silero-vad-to-ggml.py",
        "\n Falling back to traditional transcription without VAD...\n",
      ];
      for (const line of notice) {
        console.log(line);
      }

      console.log("🎵 Running traditional transcription...");
      const fallbackResult = await whisperAsync(traditionalParams);
      console.log("\n📝 Traditional transcription result:");
      console.log(fallbackResult);
      return;
    }

    console.log("🎵 Running transcription with VAD enabled...");
    console.log("VAD Parameters:");
    const settingLines = [
      ` - Threshold: ${vadParams.vad_threshold}`,
      ` - Min speech duration: ${vadParams.vad_min_speech_duration_ms}ms`,
      ` - Min silence duration: ${vadParams.vad_min_silence_duration_ms}ms`,
      ` - Max speech duration: ${vadParams.vad_max_speech_duration_s}s`,
      ` - Speech padding: ${vadParams.vad_speech_pad_ms}ms`,
      ` - Samples overlap: ${vadParams.vad_samples_overlap}\n`,
    ];
    for (const line of settingLines) {
      console.log(line);
    }

    // First pass: VAD enabled.
    const vadRun = await timeRun(vadParams);
    console.log("\n✅ VAD transcription completed!");
    console.log(`⏱️ Processing time: ${vadRun.elapsedMs}ms`);
    console.log("\n📝 VAD transcription result:");
    console.log(vadRun.output);

    // Second pass: same input without VAD, for comparison.
    console.log("\n🔄 Running traditional transcription for comparison...");
    const plainRun = await timeRun(traditionalParams);
    console.log("\n✅ Traditional transcription completed!");
    console.log(`⏱️ Processing time: ${plainRun.elapsedMs}ms`);
    console.log("\n📝 Traditional transcription result:");
    console.log(plainRun.output);

    // Report which run finished sooner and by what factor.
    console.log("\n📊 Performance Comparison:");
    console.log(`VAD: ${vadRun.elapsedMs}ms`);
    console.log(`Traditional: ${plainRun.elapsedMs}ms`);
    const speedup = plainRun.elapsedMs / vadRun.elapsedMs;
    const verdict = speedup > 1
      ? `🚀 VAD is ${speedup.toFixed(2)}x faster!`
      : `ℹ️ Traditional approach was ${(1/speedup).toFixed(2)}x faster in this case.`;
    console.log(verdict);
  } catch (error) {
    console.error("❌ Error during transcription:", error);
  }
}
|
||||
|
||||
// Start the demo only when this file is executed directly
// (`node vad-example.js`), not when it is loaded via require().
if (require.main === module) {
  runVADExample();
}

// Expose the runner and both parameter sets for reuse by other scripts.
module.exports = { runVADExample, vadParams, traditionalParams };
|
Reference in New Issue
Block a user