mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-05-16 22:20:48 +02:00
* vad : add initial Voice Activity Detection (VAD) support This commit adds support for Voice Activity Detection (VAD). When enabled, this feature will process the audio input and detect speech segments. This information is then used to reduce the number of samples that need to be processed by whisper_full. Resolves: https://github.com/ggml-org/whisper.cpp/issues/3003 --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
197 lines
7.1 KiB
Python
197 lines
7.1 KiB
Python
import os
|
|
import struct
|
|
import argparse
|
|
import torch
|
|
import numpy as np
|
|
from silero_vad import load_silero_vad, __version__ as silero_version
|
|
|
|
def convert_silero_vad(output_path, print_tensors=True):
    """Convert the Silero-VAD PyTorch model to GGML format.

    Loads the pretrained model via the ``silero_vad`` package, drops the
    8 kHz variant from its state dict, and serializes the 16 kHz model
    (header + tensors) into a binary GGML file.

    Args:
        output_path: Base output path; ``-v<silero_version>-ggml`` is
            inserted before the file extension.
        print_tensors: When True (default), print per-tensor debug
            information while converting. (Previously this parameter was
            accepted but never consulted; it now gates the debug output.)
    """
    model = load_silero_vad()
    state_dict = model.state_dict()

    # Clean up state dict keys: drop the 8 kHz model and make sure every
    # remaining key carries the "_model." prefix expected by the loader.
    cleaned_dict = {}
    for key, value in state_dict.items():
        if "_8k" not in key:  # skip 8k model
            clean_key = key if key.startswith("_model.") else "_model." + key
            cleaned_dict[clean_key] = value

    base, ext = os.path.splitext(output_path)
    output_file = f"{base}-v{silero_version}-ggml{ext}"
    print(f"Saving GGML Silero-VAD model to {output_file}")

    if print_tensors:
        print("\nTensor info for debugging:")
        for key, tensor in cleaned_dict.items():
            print(f" - {key}: {tensor.shape} ({tensor.dtype})")
        print()

    with open(output_file, "wb") as fout:
        # Write magic ("ggml" as a little-endian int32).
        fout.write(struct.pack("i", 0x67676d6c))

        # Length-prefixed model-type string.
        model_type = "silero-16k"
        str_len = len(model_type)
        fout.write(struct.pack("i", str_len))
        fout.write(model_type.encode('utf-8'))

        # Package version as three int32 fields (major, minor, patch).
        # NOTE(review): assumes silero_version always has exactly three
        # numeric dot-separated parts — a pre-release suffix would raise.
        version_parts = silero_version.split('.')
        major, minor, patch = map(int, version_parts)
        print(f"Version: {major}.{minor}.{patch}")
        fout.write(struct.pack("i", major))
        fout.write(struct.pack("i", minor))
        fout.write(struct.pack("i", patch))

        # Write model architecture parameters (fixed for the 16 kHz model).
        window_size = 512
        fout.write(struct.pack("i", window_size))
        context_size = 64
        fout.write(struct.pack("i", context_size))

        n_encoder_layers = 4
        fout.write(struct.pack("i", n_encoder_layers))

        # Write encoder dimensions: (in_channels, out_channels, kernel)
        # per layer.
        input_channels = 129
        encoder_in_channels = [input_channels, 128, 64, 64]
        encoder_out_channels = [128, 64, 64, 128]
        kernel_size = 3

        for i in range(n_encoder_layers):
            fout.write(struct.pack("i", encoder_in_channels[i]))
            fout.write(struct.pack("i", encoder_out_channels[i]))
            fout.write(struct.pack("i", kernel_size))

        # Write LSTM dimensions.
        lstm_input_size = 128
        lstm_hidden_size = 128
        fout.write(struct.pack("i", lstm_input_size))
        fout.write(struct.pack("i", lstm_hidden_size))

        # Write final conv dimensions.
        final_conv_in = 128
        final_conv_out = 1
        fout.write(struct.pack("i", final_conv_in))
        fout.write(struct.pack("i", final_conv_out))

        # Collect tensor keys to write, in the order the loader expects.
        tensor_keys = []

        # Encoder weights: only emit a layer if both weight and bias exist.
        for i in range(n_encoder_layers):
            weight_key = f"_model.encoder.{i}.reparam_conv.weight"
            bias_key = f"_model.encoder.{i}.reparam_conv.bias"
            if weight_key in cleaned_dict and bias_key in cleaned_dict:
                tensor_keys.append(weight_key)
                tensor_keys.append(bias_key)

        # LSTM weights.
        lstm_keys = [
            "_model.decoder.rnn.weight_ih",
            "_model.decoder.rnn.weight_hh",
            "_model.decoder.rnn.bias_ih",
            "_model.decoder.rnn.bias_hh"
        ]
        tensor_keys.extend([k for k in lstm_keys if k in cleaned_dict])

        # Final conv weights.
        final_keys = [
            "_model.decoder.decoder.2.weight",
            "_model.decoder.decoder.2.bias"
        ]
        tensor_keys.extend([k for k in final_keys if k in cleaned_dict])

        # STFT basis - add this last.
        stft_tensor = "_model.stft.forward_basis_buffer"
        tensor_keys.append(stft_tensor)

        if print_tensors:
            print(f"Writing {len(tensor_keys)} tensors:")
            for key in tensor_keys:
                if key in cleaned_dict:
                    print(f" - {key}: {cleaned_dict[key].shape}")
                else:
                    print(f" - {key}: MISSING")

        # Process each tensor.
        for key in tensor_keys:
            if key not in cleaned_dict:
                print(f"Warning: Missing tensor {key}, skipping")
                continue

            tensor = cleaned_dict[key]

            # Special handling for the STFT tensor: keep its full 3D
            # shape (no squeeze) and reverse the dims explicitly.
            if key == "_model.stft.forward_basis_buffer":
                data = tensor.detach().cpu().numpy()
                print(f"STFT tensor original shape: {data.shape}")
                n_dims = 3
                tensor_shape = [data.shape[2], data.shape[1], data.shape[0]]
                is_conv_weight = True
            else:
                # Standard processing: squeeze out singleton dims.
                data = tensor.detach().cpu().squeeze().numpy()
                tensor_shape = list(data.shape)

                # GGML supports at most 4 dimensions.
                n_dims = min(len(tensor_shape), 4)

                # Reverse dimensions for GGML's ne[] ordering.
                tensor_shape = tensor_shape[:n_dims]
                tensor_shape.reverse()

                # Convolution weights are stored as float16.
                is_conv_weight = "weight" in key and ("encoder" in key or "_model.decoder.decoder.2" in key)

            if is_conv_weight:
                data = data.astype(np.float16)
                ftype = 1  # float16
            else:
                # Force float32 so the on-disk byte count always matches
                # what a reader expects for ftype 0 (guards against any
                # float64 tensor slipping through; no-op for float32).
                data = data.astype(np.float32)
                ftype = 0  # float32

            # Debug printing of tensor info.
            if print_tensors:
                print(f"\nWriting tensor: {key}")
                print(f" Original shape: {tensor.shape}")
                print(f" Processed shape: {data.shape}")
                print(f" GGML dimensions: {n_dims}")
                print(f" GGML shape: {tensor_shape}")
                print(f" Type: {'float16' if ftype == 1 else 'float32'}")

            # Convert tensor name to bytes.
            name_bytes = key.encode('utf-8')
            name_length = len(name_bytes)

            # Tensor header: n_dims, name length, ftype.
            fout.write(struct.pack("i", n_dims))
            fout.write(struct.pack("i", name_length))
            fout.write(struct.pack("i", ftype))

            # Tensor dimensions (already reversed for GGML above).
            for i in range(n_dims):
                size = tensor_shape[i] if i < len(tensor_shape) else 1
                fout.write(struct.pack("i", size))
                if print_tensors:
                    print(f" Writing dimension {i}: {size}")

            # Tensor name, then raw tensor data.
            fout.write(name_bytes)
            data.tofile(fout)

            if print_tensors:
                print(f" Wrote {data.size * (2 if ftype==1 else 4)} bytes")

    print(f"\nDone! Model has been converted to GGML format: {output_file}")
    print(f"File size: {os.path.getsize(output_file)} bytes")
|
|
|
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert Silero-VAD PyTorch model to GGML format")
    parser.add_argument("--output", type=str, required=True, help="Path to output GGML model file")
    # FIX: the flag previously used action="store_true" together with
    # default=True, which made it impossible to disable (always True).
    # BooleanOptionalAction keeps --print-tensors and the True default
    # (fully backward compatible) while adding --no-print-tensors.
    parser.add_argument("--print-tensors", action=argparse.BooleanOptionalAction,
                        default=True, help="Print tensor values")
    args = parser.parse_args()

    convert_silero_vad(args.output, args.print_tensors)
|