whisper.cpp/bindings/ruby/ext/ruby_whisper_model.c
KITAITI Makoto 1f5fdbecb4
ruby : add VAD support, migration to Ruby's newer API (#3197)
* Add VAD models

* Extract function to normalize model path from ruby_whisper_initialize()

* Define ruby_whisper_vad_params struct

* Add VAD-related features to Whisper::Params

* Add tests for VAD-related features

* Define Whisper::VADParams

* Add Whisper::VAD::Params attributes

* Add test suite for VAD::Params

* Make older test to follow namespace change

* Add test for transcription with VAD

* Add assertion for test_vad_params

* Add signatures for VAD-related methods

* Define VAD::Params#==

* Add test for VAD::Params#==

* Fix Params#vad_params

* Add test for Params#vad_params

* Fix signature of Params#vad_params

* Use macro to define VAD::Params params

* Define VAD::Params#initialize

* Add tests for VAD::Params#initialize

* Add signature for VAD::Params.new

* Add documentation on VAD in README

* Wrap register_callbask in prepare_transcription for clear meanings

* Set whisper_params.vad_params just before transcription

* Don't touch NULL

* Define ruby_whisper_params_type

* Use TypedData_XXX for ruby_whisper_params instead of Data_XXX

* Remove unused functions

* Define rb_whisper_model_data_type

* Use TypedData_XXX for ruby_whisper_model instead of Data_XXX

* Define ruby_whisper_segment_type

* Use TypedData_XXX for ruby_whisper_segment instead of Data_XXX

* Define ruby_whisper_type

* Use TypedData_XXX for ruby_whisper instead of Data_XXX

* Qualify with const
2025-05-28 20:05:12 +09:00

233 lines
6.4 KiB
C

#include <ruby.h>
#include "ruby_whisper.h"
extern const rb_data_type_t ruby_whisper_type;
extern VALUE cModel;
static void rb_whisper_model_mark(void *p) {
ruby_whisper_model *rwm = (ruby_whisper_model *)p;
if (rwm->context) {
rb_gc_mark(rwm->context);
}
}
static size_t
ruby_whisper_model_memsize(const void *p)
{
const ruby_whisper_model *rwm = (const ruby_whisper_model *)p;
size_t size = sizeof(rwm);
if (!rwm) {
return 0;
}
return size;
}
static const rb_data_type_t rb_whisper_model_type = {
"ruby_whisper_model",
{rb_whisper_model_mark, RUBY_DEFAULT_FREE, ruby_whisper_model_memsize,},
0, 0,
0
};
static VALUE ruby_whisper_model_allocate(VALUE klass) {
ruby_whisper_model *rwm;
return TypedData_Make_Struct(klass, ruby_whisper_model, &rb_whisper_model_type, rwm);
}
VALUE rb_whisper_model_initialize(VALUE context) {
ruby_whisper_model *rwm;
const VALUE model = ruby_whisper_model_allocate(cModel);
TypedData_Get_Struct(model, ruby_whisper_model, &rb_whisper_model_type, rwm);
rwm->context = context;
return model;
};
/*
* call-seq:
* n_vocab -> Integer
*/
static VALUE
ruby_whisper_model_n_vocab(VALUE self)
{
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
return INT2NUM(whisper_model_n_vocab(rw->context));
}
/*
* call-seq:
* n_audio_ctx -> Integer
*/
static VALUE
ruby_whisper_model_n_audio_ctx(VALUE self)
{
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
return INT2NUM(whisper_model_n_audio_ctx(rw->context));
}
/*
* call-seq:
* n_audio_state -> Integer
*/
static VALUE
ruby_whisper_model_n_audio_state(VALUE self)
{
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
return INT2NUM(whisper_model_n_audio_state(rw->context));
}
/*
* call-seq:
* n_audio_head -> Integer
*/
static VALUE
ruby_whisper_model_n_audio_head(VALUE self)
{
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
return INT2NUM(whisper_model_n_audio_head(rw->context));
}
/*
* call-seq:
* n_audio_layer -> Integer
*/
static VALUE
ruby_whisper_model_n_audio_layer(VALUE self)
{
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
return INT2NUM(whisper_model_n_audio_layer(rw->context));
}
/*
* call-seq:
* n_text_ctx -> Integer
*/
static VALUE
ruby_whisper_model_n_text_ctx(VALUE self)
{
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
return INT2NUM(whisper_model_n_text_ctx(rw->context));
}
/*
* call-seq:
* n_text_state -> Integer
*/
static VALUE
ruby_whisper_model_n_text_state(VALUE self)
{
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
return INT2NUM(whisper_model_n_text_state(rw->context));
}
/*
* call-seq:
* n_text_head -> Integer
*/
static VALUE
ruby_whisper_model_n_text_head(VALUE self)
{
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
return INT2NUM(whisper_model_n_text_head(rw->context));
}
/*
* call-seq:
* n_text_layer -> Integer
*/
static VALUE
ruby_whisper_model_n_text_layer(VALUE self)
{
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
return INT2NUM(whisper_model_n_text_layer(rw->context));
}
/*
* call-seq:
* n_mels -> Integer
*/
static VALUE
ruby_whisper_model_n_mels(VALUE self)
{
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
return INT2NUM(whisper_model_n_mels(rw->context));
}
/*
* call-seq:
* ftype -> Integer
*/
static VALUE
ruby_whisper_model_ftype(VALUE self)
{
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
return INT2NUM(whisper_model_ftype(rw->context));
}
/*
* call-seq:
* type -> String
*/
static VALUE
ruby_whisper_model_type(VALUE self)
{
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
return rb_str_new2(whisper_model_type_readable(rw->context));
}
void
init_ruby_whisper_model(VALUE *mWhisper)
{
cModel = rb_define_class_under(*mWhisper, "Model", rb_cObject);
rb_define_alloc_func(cModel, ruby_whisper_model_allocate);
rb_define_method(cModel, "n_vocab", ruby_whisper_model_n_vocab, 0);
rb_define_method(cModel, "n_audio_ctx", ruby_whisper_model_n_audio_ctx, 0);
rb_define_method(cModel, "n_audio_state", ruby_whisper_model_n_audio_state, 0);
rb_define_method(cModel, "n_audio_head", ruby_whisper_model_n_audio_head, 0);
rb_define_method(cModel, "n_audio_layer", ruby_whisper_model_n_audio_layer, 0);
rb_define_method(cModel, "n_text_ctx", ruby_whisper_model_n_text_ctx, 0);
rb_define_method(cModel, "n_text_state", ruby_whisper_model_n_text_state, 0);
rb_define_method(cModel, "n_text_head", ruby_whisper_model_n_text_head, 0);
rb_define_method(cModel, "n_text_layer", ruby_whisper_model_n_text_layer, 0);
rb_define_method(cModel, "n_mels", ruby_whisper_model_n_mels, 0);
rb_define_method(cModel, "ftype", ruby_whisper_model_ftype, 0);
rb_define_method(cModel, "type", ruby_whisper_model_type, 0);
}