whisper.cpp/models/convert-silero-vad-to-ggml.py
Daniel Bevenius e41bc5c61a
vad : add initial Voice Activity Detection (VAD) support (#3065)
* vad : add initial Voice Activity Detection (VAD) support

This commit adds support for Voice Activity Detection (VAD). When enabled
this feature will process the audio input and detect speech segments.
This information is then used to reduce the number of samples that need
to be processed by whisper_full.

Resolves: https://github.com/ggml-org/whisper.cpp/issues/3003

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-05-12 16:10:11 +02:00

197 lines
7.1 KiB
Python

import os
import struct
import argparse
import torch
import numpy as np
from silero_vad import load_silero_vad, __version__ as silero_version
def _write_header(fout):
    """Write the GGML magic, model-type string, and silero_vad version."""
    # GGML magic number ("ggml" as little-endian int).
    fout.write(struct.pack("i", 0x67676d6c))

    model_type = "silero-16k"
    fout.write(struct.pack("i", len(model_type)))
    fout.write(model_type.encode('utf-8'))

    # NOTE: assumes the version string is exactly "major.minor.patch";
    # anything else (e.g. a ".post1" suffix) raises ValueError here.
    major, minor, patch = map(int, silero_version.split('.'))
    print(f"Version: {major}.{minor}.{patch}")
    fout.write(struct.pack("i", major))
    fout.write(struct.pack("i", minor))
    fout.write(struct.pack("i", patch))


def _write_hparams(fout):
    """Write the fixed model-architecture hyperparameters.

    Returns:
        The number of encoder layers (needed to collect tensor keys).
    """
    window_size = 512
    fout.write(struct.pack("i", window_size))
    context_size = 64
    fout.write(struct.pack("i", context_size))

    n_encoder_layers = 4
    fout.write(struct.pack("i", n_encoder_layers))

    # Per-layer encoder conv dimensions (in, out, kernel).
    input_channels = 129
    encoder_in_channels = [input_channels, 128, 64, 64]
    encoder_out_channels = [128, 64, 64, 128]
    kernel_size = 3
    for i in range(n_encoder_layers):
        fout.write(struct.pack("i", encoder_in_channels[i]))
        fout.write(struct.pack("i", encoder_out_channels[i]))
        fout.write(struct.pack("i", kernel_size))

    # LSTM dimensions.
    lstm_input_size = 128
    lstm_hidden_size = 128
    fout.write(struct.pack("i", lstm_input_size))
    fout.write(struct.pack("i", lstm_hidden_size))

    # Final conv dimensions.
    final_conv_in = 128
    final_conv_out = 1
    fout.write(struct.pack("i", final_conv_in))
    fout.write(struct.pack("i", final_conv_out))

    return n_encoder_layers


def _collect_tensor_keys(cleaned_dict, n_encoder_layers):
    """Return the ordered list of tensor keys to serialize.

    Order matters: encoder layers, LSTM, final conv, then the STFT basis
    buffer last. Keys absent from cleaned_dict are silently skipped,
    except the STFT key which is always appended.
    """
    tensor_keys = []

    # Encoder weights (only if both weight and bias are present).
    for i in range(n_encoder_layers):
        weight_key = f"_model.encoder.{i}.reparam_conv.weight"
        bias_key = f"_model.encoder.{i}.reparam_conv.bias"
        if weight_key in cleaned_dict and bias_key in cleaned_dict:
            tensor_keys.append(weight_key)
            tensor_keys.append(bias_key)

    # LSTM weights.
    lstm_keys = [
        "_model.decoder.rnn.weight_ih",
        "_model.decoder.rnn.weight_hh",
        "_model.decoder.rnn.bias_ih",
        "_model.decoder.rnn.bias_hh"
    ]
    tensor_keys.extend([k for k in lstm_keys if k in cleaned_dict])

    # Final conv weights.
    final_keys = [
        "_model.decoder.decoder.2.weight",
        "_model.decoder.decoder.2.bias"
    ]
    tensor_keys.extend([k for k in final_keys if k in cleaned_dict])

    # STFT basis - add this last.
    tensor_keys.append("_model.stft.forward_basis_buffer")
    return tensor_keys


def _write_tensor(fout, key, tensor, print_tensors):
    """Serialize one tensor: header (dims, name length, ftype), dims, name, data."""
    if key == "_model.stft.forward_basis_buffer":
        # Special handling: keep the original 3-D shape (no squeeze) and
        # write the dimensions reversed for GGML.
        data = tensor.detach().cpu().numpy()
        if print_tensors:
            print(f"STFT tensor original shape: {data.shape}")
        n_dims = 3
        tensor_shape = [data.shape[2], data.shape[1], data.shape[0]]
        is_conv_weight = True
    else:
        # Standard processing: squeeze singleton dims, cap at 4 dims,
        # reverse for GGML's dimension order.
        data = tensor.detach().cpu().squeeze().numpy()
        tensor_shape = list(data.shape)
        n_dims = min(len(tensor_shape), 4)
        tensor_shape = tensor_shape[:n_dims]
        tensor_shape.reverse()
        is_conv_weight = "weight" in key and ("encoder" in key or "_model.decoder.decoder.2" in key)

    # Convolution weights are stored as float16, everything else float32.
    if is_conv_weight:
        data = data.astype(np.float16)
        ftype = 1  # float16
    else:
        ftype = 0  # float32

    if print_tensors:
        print(f"\nWriting tensor: {key}")
        print(f"  Original shape: {tensor.shape}")
        print(f"  Processed shape: {data.shape}")
        print(f"  GGML dimensions: {n_dims}")
        print(f"  GGML shape: {tensor_shape}")
        print(f"  Type: {'float16' if ftype == 1 else 'float32'}")

    name_bytes = key.encode('utf-8')

    # Tensor header: n_dims, name length, ftype.
    fout.write(struct.pack("i", n_dims))
    fout.write(struct.pack("i", len(name_bytes)))
    fout.write(struct.pack("i", ftype))

    # Dimensions, padded with 1 if tensor_shape is short.
    for i in range(n_dims):
        size = tensor_shape[i] if i < len(tensor_shape) else 1
        fout.write(struct.pack("i", size))
        if print_tensors:
            print(f"  Writing dimension {i}: {size}")

    fout.write(name_bytes)
    data.tofile(fout)
    if print_tensors:
        print(f"  Wrote {data.size * (2 if ftype==1 else 4)} bytes")


def convert_silero_vad(output_path, print_tensors=True):
    """Convert the Silero-VAD 16 kHz PyTorch model to GGML format.

    Loads the model via the silero_vad package and writes its header,
    hyperparameters, and weights to a single binary file. The actual
    output file is "<base>-v<silero_version>-ggml<ext>" derived from
    output_path.

    Args:
        output_path: Requested output path; version/ggml suffix is
            inserted before the extension.
        print_tensors: When True (default), print per-tensor debug
            information while converting. (Previously this parameter
            was accepted but ignored.)
    """
    model = load_silero_vad()
    state_dict = model.state_dict()

    # Clean up state dict keys: drop the 8k model variant and make sure
    # every remaining key carries the "_model." prefix.
    cleaned_dict = {}
    for key, value in state_dict.items():
        if "_8k" in key:
            continue  # Skip 8k model
        clean_key = key if key.startswith("_model.") else "_model." + key
        cleaned_dict[clean_key] = value

    base, ext = os.path.splitext(output_path)
    output_file = f"{base}-v{silero_version}-ggml{ext}"
    print(f"Saving GGML Silero-VAD model to {output_file}")

    if print_tensors:
        print("\nTensor info for debugging:")
        for key, tensor in cleaned_dict.items():
            print(f"  - {key}: {tensor.shape} ({tensor.dtype})")
        print()

    with open(output_file, "wb") as fout:
        _write_header(fout)
        n_encoder_layers = _write_hparams(fout)
        tensor_keys = _collect_tensor_keys(cleaned_dict, n_encoder_layers)

        print(f"Writing {len(tensor_keys)} tensors:")
        if print_tensors:
            for key in tensor_keys:
                if key in cleaned_dict:
                    print(f"  - {key}: {cleaned_dict[key].shape}")
                else:
                    print(f"  - {key}: MISSING")

        for key in tensor_keys:
            if key not in cleaned_dict:
                print(f"Warning: Missing tensor {key}, skipping")
                continue
            _write_tensor(fout, key, cleaned_dict[key], print_tensors)

    print(f"\nDone! Model has been converted to GGML format: {output_file}")
    print(f"File size: {os.path.getsize(output_file)} bytes")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Convert Silero-VAD PyTorch model to GGML format")
    parser.add_argument("--output", type=str, required=True,
                        help="Path to output GGML model file")
    # BUG FIX: the original used action="store_true" together with
    # default=True, so the flag was a no-op and tensor printing could
    # never be disabled. Keep the default (True) for backward
    # compatibility and add an explicit off switch.
    parser.add_argument("--print-tensors", dest="print_tensors",
                        action="store_true", default=True,
                        help="Print tensor values (default)")
    parser.add_argument("--no-print-tensors", dest="print_tensors",
                        action="store_false",
                        help="Suppress per-tensor debug output")
    args = parser.parse_args()
    convert_silero_vad(args.output, args.print_tensors)