whisper.cpp/models/convert-silero-vad-to-ggml.py
Daniel Bevenius e41bc5c61a
vad : add initial Voice Activity Detection (VAD) support (#3065)
* vad : add initial Voice Activity Detection (VAD) support

This commit adds support for Voice Activity Detection (VAD). When enabled
this feature will process the audio input and detect speech segments.
This information is then used to reduce the number of samples that need
to be processed by whisper_full.

Resolves: https://github.com/ggml-org/whisper.cpp/issues/3003

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-05-12 16:10:11 +02:00

197 lines
7.1 KiB
Python

import os
import struct
import argparse
import torch
import numpy as np
from silero_vad import load_silero_vad, __version__ as silero_version
def _write_header(fout):
    """Write the GGML magic, model-type string, and silero_vad version."""
    # GGML magic number ("ggml" as little-endian int).
    fout.write(struct.pack("i", 0x67676d6c))

    model_type = "silero-16k"
    fout.write(struct.pack("i", len(model_type)))
    fout.write(model_type.encode('utf-8'))

    # NOTE: assumes the version string is exactly "major.minor.patch";
    # anything else (e.g. a ".post1" suffix) raises ValueError here.
    major, minor, patch = map(int, silero_version.split('.'))
    print(f"Version: {major}.{minor}.{patch}")
    fout.write(struct.pack("i", major))
    fout.write(struct.pack("i", minor))
    fout.write(struct.pack("i", patch))


def _write_hparams(fout):
    """Write the fixed model-architecture hyperparameters.

    Returns:
        The number of encoder layers (needed to collect tensor keys).
    """
    window_size = 512
    fout.write(struct.pack("i", window_size))
    context_size = 64
    fout.write(struct.pack("i", context_size))

    n_encoder_layers = 4
    fout.write(struct.pack("i", n_encoder_layers))

    # Per-layer encoder conv dimensions (in, out, kernel).
    input_channels = 129
    encoder_in_channels = [input_channels, 128, 64, 64]
    encoder_out_channels = [128, 64, 64, 128]
    kernel_size = 3
    for i in range(n_encoder_layers):
        fout.write(struct.pack("i", encoder_in_channels[i]))
        fout.write(struct.pack("i", encoder_out_channels[i]))
        fout.write(struct.pack("i", kernel_size))

    # LSTM dimensions.
    lstm_input_size = 128
    lstm_hidden_size = 128
    fout.write(struct.pack("i", lstm_input_size))
    fout.write(struct.pack("i", lstm_hidden_size))

    # Final conv dimensions.
    final_conv_in = 128
    final_conv_out = 1
    fout.write(struct.pack("i", final_conv_in))
    fout.write(struct.pack("i", final_conv_out))

    return n_encoder_layers


def _collect_tensor_keys(cleaned_dict, n_encoder_layers):
    """Return the ordered list of tensor keys to serialize.

    Order matters: encoder layers, LSTM, final conv, then the STFT basis
    buffer last. Keys absent from cleaned_dict are silently skipped,
    except the STFT key which is always appended.
    """
    tensor_keys = []

    # Encoder weights (only if both weight and bias are present).
    for i in range(n_encoder_layers):
        weight_key = f"_model.encoder.{i}.reparam_conv.weight"
        bias_key = f"_model.encoder.{i}.reparam_conv.bias"
        if weight_key in cleaned_dict and bias_key in cleaned_dict:
            tensor_keys.append(weight_key)
            tensor_keys.append(bias_key)

    # LSTM weights.
    lstm_keys = [
        "_model.decoder.rnn.weight_ih",
        "_model.decoder.rnn.weight_hh",
        "_model.decoder.rnn.bias_ih",
        "_model.decoder.rnn.bias_hh"
    ]
    tensor_keys.extend([k for k in lstm_keys if k in cleaned_dict])

    # Final conv weights.
    final_keys = [
        "_model.decoder.decoder.2.weight",
        "_model.decoder.decoder.2.bias"
    ]
    tensor_keys.extend([k for k in final_keys if k in cleaned_dict])

    # STFT basis - add this last.
    tensor_keys.append("_model.stft.forward_basis_buffer")
    return tensor_keys


def _write_tensor(fout, key, tensor, print_tensors):
    """Serialize one tensor: header (dims, name length, ftype), dims, name, data."""
    if key == "_model.stft.forward_basis_buffer":
        # Special handling: keep the original 3-D shape (no squeeze) and
        # write the dimensions reversed for GGML.
        data = tensor.detach().cpu().numpy()
        if print_tensors:
            print(f"STFT tensor original shape: {data.shape}")
        n_dims = 3
        tensor_shape = [data.shape[2], data.shape[1], data.shape[0]]
        is_conv_weight = True
    else:
        # Standard processing: squeeze singleton dims, cap at 4 dims,
        # reverse for GGML's dimension order.
        data = tensor.detach().cpu().squeeze().numpy()
        tensor_shape = list(data.shape)
        n_dims = min(len(tensor_shape), 4)
        tensor_shape = tensor_shape[:n_dims]
        tensor_shape.reverse()
        is_conv_weight = "weight" in key and ("encoder" in key or "_model.decoder.decoder.2" in key)

    # Convolution weights are stored as float16, everything else float32.
    if is_conv_weight:
        data = data.astype(np.float16)
        ftype = 1  # float16
    else:
        ftype = 0  # float32

    if print_tensors:
        print(f"\nWriting tensor: {key}")
        print(f"  Original shape: {tensor.shape}")
        print(f"  Processed shape: {data.shape}")
        print(f"  GGML dimensions: {n_dims}")
        print(f"  GGML shape: {tensor_shape}")
        print(f"  Type: {'float16' if ftype == 1 else 'float32'}")

    name_bytes = key.encode('utf-8')

    # Tensor header: n_dims, name length, ftype.
    fout.write(struct.pack("i", n_dims))
    fout.write(struct.pack("i", len(name_bytes)))
    fout.write(struct.pack("i", ftype))

    # Dimensions, padded with 1 if tensor_shape is short.
    for i in range(n_dims):
        size = tensor_shape[i] if i < len(tensor_shape) else 1
        fout.write(struct.pack("i", size))
        if print_tensors:
            print(f"  Writing dimension {i}: {size}")

    fout.write(name_bytes)
    data.tofile(fout)
    if print_tensors:
        print(f"  Wrote {data.size * (2 if ftype==1 else 4)} bytes")


def convert_silero_vad(output_path, print_tensors=True):
    """Convert the Silero-VAD 16 kHz PyTorch model to GGML format.

    Loads the model via the silero_vad package and writes its header,
    hyperparameters, and weights to a single binary file. The actual
    output file is "<base>-v<silero_version>-ggml<ext>" derived from
    output_path.

    Args:
        output_path: Requested output path; version/ggml suffix is
            inserted before the extension.
        print_tensors: When True (default), print per-tensor debug
            information while converting. (Previously this parameter
            was accepted but ignored.)
    """
    model = load_silero_vad()
    state_dict = model.state_dict()

    # Clean up state dict keys: drop the 8k model variant and make sure
    # every remaining key carries the "_model." prefix.
    cleaned_dict = {}
    for key, value in state_dict.items():
        if "_8k" in key:
            continue  # Skip 8k model
        clean_key = key if key.startswith("_model.") else "_model." + key
        cleaned_dict[clean_key] = value

    base, ext = os.path.splitext(output_path)
    output_file = f"{base}-v{silero_version}-ggml{ext}"
    print(f"Saving GGML Silero-VAD model to {output_file}")

    if print_tensors:
        print("\nTensor info for debugging:")
        for key, tensor in cleaned_dict.items():
            print(f"  - {key}: {tensor.shape} ({tensor.dtype})")
        print()

    with open(output_file, "wb") as fout:
        _write_header(fout)
        n_encoder_layers = _write_hparams(fout)
        tensor_keys = _collect_tensor_keys(cleaned_dict, n_encoder_layers)

        print(f"Writing {len(tensor_keys)} tensors:")
        if print_tensors:
            for key in tensor_keys:
                if key in cleaned_dict:
                    print(f"  - {key}: {cleaned_dict[key].shape}")
                else:
                    print(f"  - {key}: MISSING")

        for key in tensor_keys:
            if key not in cleaned_dict:
                print(f"Warning: Missing tensor {key}, skipping")
                continue
            _write_tensor(fout, key, cleaned_dict[key], print_tensors)

    print(f"\nDone! Model has been converted to GGML format: {output_file}")
    print(f"File size: {os.path.getsize(output_file)} bytes")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Convert Silero-VAD PyTorch model to GGML format")
    parser.add_argument("--output", type=str, required=True,
                        help="Path to output GGML model file")
    # BUG FIX: the original used action="store_true" together with
    # default=True, so the flag was a no-op and tensor printing could
    # never be disabled. Keep the default (True) for backward
    # compatibility and add an explicit off switch.
    parser.add_argument("--print-tensors", dest="print_tensors",
                        action="store_true", default=True,
                        help="Print tensor values (default)")
    parser.add_argument("--no-print-tensors", dest="print_tensors",
                        action="store_false",
                        help="Suppress per-tensor debug output")
    args = parser.parse_args()
    convert_silero_vad(args.output, args.print_tensors)