ruby : add Core ML support (#3214)

* Prevent overflow

* Fix memsize of Whisper::Context

* Rename xxx_initialize to more Ruby-esque name: xxx_s_new

* Define Whisper::Model::ZipURI

* Define Whisper::Model.coreml_compiled_models

* Make Options' @cmake_options Hash

* Use --{enable,disable}-whisper-coreml option for -I/opt/homebrew/opt/llvm/include

* Prepare Core ML model if enabled

* Add test for ZipURI

* Add signatures for ZipURI

* Add Whisper.system_info_str

* Add test for Whisper.system_info_str

* Add signature for Model.coreml_compiled_models

* Add signature for Whisper.system_info_str

* Add test for Core ML

* Update date

* Maintain .gitignore
This commit is contained in:
KITAITI Makoto
2025-06-01 18:16:02 +09:00
committed by GitHub
parent 98dfe8dc26
commit 0251445005
14 changed files with 175 additions and 40 deletions

View File

@ -2,10 +2,8 @@ Makefile
whisper.so
whisper.bundle
whisper.dll
scripts/get-flags.mk
*.o
/*/**/*.c
/*/**/*.cpp
/*/**/*.h
/*/**/*.m
/*/**/*.metal
*.a
sources/*
!sources/CMakeGraphVizOptions.cmake
mkmf.log

View File

@ -20,27 +20,39 @@ class Options
Dir.chdir __dir__ do
output = `#{@cmake.shellescape} -S sources -B build -L`
end
started = false
@cmake_options = output.lines.filter_map {|line|
if line.chomp == "-- Cache values"
started = true
next
end
next unless started
option, value = line.chomp.split("=", 2)
name, type = option.split(":", 2)
[name, type, value]
}
@cmake_options = output.lines.drop_while {|line| line.chomp != "-- Cache values"}.drop(1)
.filter_map {|line|
option, value = line.chomp.split("=", 2)
name, type = option.split(":", 2)
[
name,
[
type,
type == "BOOL" ? value == "ON" : value
]
]
}.to_h
end
private
def configure
cmake_options.each do |name, type, default_value|
cmake_options.each_pair do |name, (type, default_value)|
option = option_name(name)
value = type == "BOOL" ? enable_config(option) : arg_config("--#{option}")
@options[name] = [type, value]
end
configure_coreml
end
# Appends -DRUBY_WHISPER_USE_COREML to $CPPFLAGS when Core ML is enabled.
#
# The value recorded in @options for WHISPER_COREML takes precedence; only
# when it is nil does the value stored in cmake_options decide.
def configure_coreml
  requested = @options["WHISPER_COREML"][1]
  # A false value must still win over the CMake value, so test for nil explicitly.
  use_coreml = requested.nil? ? cmake_options["WHISPER_COREML"][1] : requested
  $CPPFLAGS << " -DRUBY_WHISPER_USE_COREML" if use_coreml
end
def option_name(name)

View File

@ -22,6 +22,8 @@ ID id_new;
ID id_to_path;
ID id_URI;
ID id_pre_converted_models;
ID id_coreml_compiled_models;
ID id_cache;
static bool is_log_callback_finalized = false;
@ -83,6 +85,14 @@ static VALUE ruby_whisper_s_lang_str_full(VALUE self, VALUE id) {
return rb_str_new2(str_full);
}
/*
 * Returns the text produced by whisper_print_system_info() as a new Ruby String.
 *
 * call-seq:
 *   system_info_str -> String
 */
static VALUE ruby_whisper_s_system_info_str(VALUE self) {
  const char *system_info = whisper_print_system_info();
  return rb_str_new2(system_info);
}
static VALUE ruby_whisper_s_finalize_log_callback(VALUE self, VALUE id) {
is_log_callback_finalized = true;
return Qnil;
@ -130,6 +140,8 @@ void Init_whisper() {
id_to_path = rb_intern("to_path");
id_URI = rb_intern("URI");
id_pre_converted_models = rb_intern("pre_converted_models");
id_coreml_compiled_models = rb_intern("coreml_compiled_models");
id_cache = rb_intern("cache");
mWhisper = rb_define_module("Whisper");
mVAD = rb_define_module_under(mWhisper, "VAD");
@ -145,6 +157,7 @@ void Init_whisper() {
rb_define_singleton_method(mWhisper, "lang_id", ruby_whisper_s_lang_id, 1);
rb_define_singleton_method(mWhisper, "lang_str", ruby_whisper_s_lang_str, 1);
rb_define_singleton_method(mWhisper, "lang_str_full", ruby_whisper_s_lang_str_full, 1);
rb_define_singleton_method(mWhisper, "system_info_str", ruby_whisper_s_system_info_str, 0);
rb_define_singleton_method(mWhisper, "log_set", ruby_whisper_s_log_set, 2);
rb_define_private_method(rb_singleton_class(mWhisper), "finalize_log_callback", ruby_whisper_s_finalize_log_callback, 1);

View File

@ -11,6 +11,8 @@ extern ID id_new;
extern ID id_to_path;
extern ID id_URI;
extern ID id_pre_converted_models;
extern ID id_coreml_compiled_models;
extern ID id_cache;
extern VALUE cContext;
extern VALUE eError;
@ -18,8 +20,8 @@ extern VALUE cModel;
extern const rb_data_type_t ruby_whisper_params_type;
extern VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self);
extern VALUE rb_whisper_model_initialize(VALUE context);
extern VALUE rb_whisper_segment_initialize(VALUE context, int index);
extern VALUE rb_whisper_model_s_new(VALUE context);
extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context);
static void
@ -53,6 +55,9 @@ ruby_whisper_memsize(const void *p)
if (!rw) {
return 0;
}
if (rw->context) {
size += sizeof(rw->context);
}
return size;
}
@ -79,6 +84,13 @@ ruby_whisper_normalize_model_path(VALUE model_path)
VALUE pre_converted_model = rb_hash_aref(pre_converted_models, model_path);
if (!NIL_P(pre_converted_model)) {
model_path = pre_converted_model;
#ifdef RUBY_WHISPER_USE_COREML
VALUE coreml_converted_models = rb_funcall(cModel, id_coreml_compiled_models, 0);
VALUE coreml_converted_model = rb_hash_aref(coreml_converted_models, pre_converted_model);
if (!NIL_P(coreml_converted_model)) {
rb_funcall(coreml_converted_model, id_cache, 0);
}
#endif
}
else if (TYPE(model_path) == T_STRING) {
const char * model_path_str = StringValueCStr(model_path);
@ -293,13 +305,20 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self)
// Should check when samples.respond_to?(:length)?
} else {
if (TYPE(samples) == T_ARRAY) {
n_samples = RARRAY_LEN(samples);
if (RARRAY_LEN(samples) > INT_MAX) {
rb_raise(rb_eArgError, "samples are too long");
}
n_samples = (int)RARRAY_LEN(samples);
} else if (memory_view_available_p) {
if (!rb_memory_view_get(samples, &view, RUBY_MEMORY_VIEW_SIMPLE)) {
view.obj = Qnil;
rb_raise(rb_eArgError, "unable to get a memory view");
}
n_samples = view.byte_size / view.item_size;
ssize_t n_samples_size = view.byte_size / view.item_size;
if (n_samples_size > INT_MAX) {
rb_raise(rb_eArgError, "samples are too long");
}
n_samples = (int)n_samples_size;
} else if (rb_respond_to(samples, id_length)) {
n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
} else {
@ -387,10 +406,17 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self)
view.obj = Qnil;
rb_raise(rb_eArgError, "unable to get a memory view");
}
n_samples = view.byte_size / view.item_size;
ssize_t n_samples_size = view.byte_size / view.item_size;
if (n_samples_size > INT_MAX) {
rb_raise(rb_eArgError, "samples are too long");
}
n_samples = (int)n_samples_size;
} else {
if (TYPE(samples) == T_ARRAY) {
n_samples = RARRAY_LEN(samples);
if (RARRAY_LEN(samples) > INT_MAX) {
rb_raise(rb_eArgError, "samples are too long");
}
n_samples = (int)RARRAY_LEN(samples);
} else if (rb_respond_to(samples, id_length)) {
n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
} else {
@ -476,7 +502,7 @@ ruby_whisper_full_get_segment_t0(VALUE self, VALUE i_segment)
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
const int64_t t0 = whisper_full_get_segment_t0(rw->context, c_i_segment);
return INT2NUM(t0);
return LONG2NUM(t0);
}
/*
@ -494,7 +520,7 @@ ruby_whisper_full_get_segment_t1(VALUE self, VALUE i_segment)
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
const int64_t t1 = whisper_full_get_segment_t1(rw->context, c_i_segment);
return INT2NUM(t1);
return LONG2NUM(t1);
}
/*
@ -552,7 +578,7 @@ ruby_whisper_full_get_segment_no_speech_prob(VALUE self, VALUE i_segment)
static VALUE
ruby_whisper_full_get_segment(VALUE self, VALUE i_segment)
{
return rb_whisper_segment_initialize(self, NUM2INT(i_segment));
return rb_whisper_segment_s_new(self, NUM2INT(i_segment));
}
/*
@ -586,7 +612,7 @@ ruby_whisper_each_segment(VALUE self)
const int n_segments = whisper_full_n_segments(rw->context);
for (int i = 0; i < n_segments; ++i) {
rb_yield(rb_whisper_segment_initialize(self, i));
rb_yield(rb_whisper_segment_s_new(self, i));
}
return self;
@ -599,7 +625,7 @@ ruby_whisper_each_segment(VALUE self)
static VALUE
ruby_whisper_get_model(VALUE self)
{
return rb_whisper_model_initialize(self);
return rb_whisper_model_s_new(self);
}
void

View File

@ -35,7 +35,7 @@ static VALUE ruby_whisper_model_allocate(VALUE klass) {
return TypedData_Make_Struct(klass, ruby_whisper_model, &rb_whisper_model_type, rwm);
}
VALUE rb_whisper_model_initialize(VALUE context) {
VALUE rb_whisper_model_s_new(VALUE context) {
ruby_whisper_model *rwm;
const VALUE model = ruby_whisper_model_allocate(cModel);
TypedData_Get_Struct(model, ruby_whisper_model, &rb_whisper_model_type, rwm);

View File

@ -34,7 +34,7 @@ extern VALUE cVADParams;
extern ID id_call;
extern VALUE ruby_whisper_normalize_model_path(VALUE model_path);
extern VALUE rb_whisper_segment_initialize(VALUE context, int index);
extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
extern const rb_data_type_t ruby_whisper_vad_params_type;
static ID param_names[RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT];
@ -110,7 +110,7 @@ static void new_segment_callback(struct whisper_context *ctx, struct whisper_sta
const int n_segments = whisper_full_n_segments_from_state(state);
for (int i = n_new; i > 0; i--) {
int i_segment = n_segments - i;
VALUE segment = rb_whisper_segment_initialize(*container->context, i_segment);
VALUE segment = rb_whisper_segment_s_new(*container->context, i_segment);
for (int j = 0; j < callbacks_len; j++) {
VALUE cb = rb_ary_entry(container->callbacks, j);
rb_funcall(cb, id_call, 1, segment);

View File

@ -38,7 +38,7 @@ ruby_whisper_segment_allocate(VALUE klass)
}
VALUE
rb_whisper_segment_initialize(VALUE context, int index)
rb_whisper_segment_s_new(VALUE context, int index)
{
ruby_whisper_segment *rws;
const VALUE segment = ruby_whisper_segment_allocate(cSegment);
@ -63,7 +63,7 @@ ruby_whisper_segment_get_start_time(VALUE self)
TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
const int64_t t0 = whisper_full_get_segment_t0(rw->context, rws->index);
// able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
return INT2NUM(t0 * 10);
return LONG2NUM(t0 * 10);
}
/*
@ -81,7 +81,7 @@ ruby_whisper_segment_get_end_time(VALUE self)
TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
const int64_t t1 = whisper_full_get_segment_t1(rw->context, rws->index);
// able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
return INT2NUM(t1 * 10);
return LONG2NUM(t1 * 10);
}
/*