From 0251445005a3ea9db60bd5f47a3470f08a2414ca Mon Sep 17 00:00:00 2001 From: KITAITI Makoto Date: Sun, 1 Jun 2025 18:16:02 +0900 Subject: [PATCH] ruby : add Core ML support (#3214) * Prevent overflow * Fix memsize of Whisper::Context * Rename xxx_initialize to more Ruby-esque name: xxx_s_new * Define Whisper::Model::ZipURI * Define Whisper::Model.coreml_compiled_models * Make Options' @cmake_options Hash * Use --{enable,disable}-whisper-coreml option for -I/opt/homebrew/opt/llvm/include * Prepare Core ML model if enabled * Add test for ZipURI * Add signatures for ZipURI * Add Whisper.system_info_str * Add test for Whisper.system_info_str * Add signature for Model.coreml_compiled_models * Add signature for Whisper.system_info_str * Add test for Core ML * Update date * Maintain .gitignore --- bindings/ruby/.gitignore | 3 -- bindings/ruby/ext/.gitignore | 10 ++--- bindings/ruby/ext/options.rb | 36 ++++++++++----- bindings/ruby/ext/ruby_whisper.c | 13 ++++++ bindings/ruby/ext/ruby_whisper_context.c | 48 +++++++++++++++----- bindings/ruby/ext/ruby_whisper_model.c | 2 +- bindings/ruby/ext/ruby_whisper_params.c | 4 +- bindings/ruby/ext/ruby_whisper_segment.c | 6 +-- bindings/ruby/lib/whisper/model/uri.rb | 57 +++++++++++++++++++++++- bindings/ruby/sig/whisper.rbs | 7 +++ bindings/ruby/tests/test_model.rb | 9 ++++ bindings/ruby/tests/test_package.rb | 14 ++++++ bindings/ruby/tests/test_whisper.rb | 4 ++ bindings/ruby/whispercpp.gemspec | 2 +- 14 files changed, 175 insertions(+), 40 deletions(-) diff --git a/bindings/ruby/.gitignore b/bindings/ruby/.gitignore index e93e6fac..e04a90a9 100644 --- a/bindings/ruby/.gitignore +++ b/bindings/ruby/.gitignore @@ -1,6 +1,3 @@ LICENSE pkg/ lib/whisper.* -ext/sources/* -!ext/sources/CMakeGraphVizOptions.cmake -ext/mkmf.log diff --git a/bindings/ruby/ext/.gitignore b/bindings/ruby/ext/.gitignore index 7703146f..6fd36e40 100644 --- a/bindings/ruby/ext/.gitignore +++ b/bindings/ruby/ext/.gitignore @@ -2,10 +2,8 @@ Makefile whisper.so 
whisper.bundle whisper.dll -scripts/get-flags.mk *.o -/*/**/*.c -/*/**/*.cpp -/*/**/*.h -/*/**/*.m -/*/**/*.metal +*.a +sources/* +!sources/CMakeGraphVizOptions.cmake +mkmf.log diff --git a/bindings/ruby/ext/options.rb b/bindings/ruby/ext/options.rb index 46408ae8..03648fbf 100644 --- a/bindings/ruby/ext/options.rb +++ b/bindings/ruby/ext/options.rb @@ -20,27 +20,39 @@ class Options Dir.chdir __dir__ do output = `#{@cmake.shellescape} -S sources -B build -L` end - started = false - @cmake_options = output.lines.filter_map {|line| - if line.chomp == "-- Cache values" - started = true - next - end - next unless started - option, value = line.chomp.split("=", 2) - name, type = option.split(":", 2) - [name, type, value] - } + @cmake_options = output.lines.drop_while {|line| line.chomp != "-- Cache values"}.drop(1) + .filter_map {|line| + option, value = line.chomp.split("=", 2) + name, type = option.split(":", 2) + [ + name, + [ + type, + type == "BOOL" ? value == "ON" : value + ] + ] + }.to_h end private def configure - cmake_options.each do |name, type, default_value| + cmake_options.each_pair do |name, (type, default_value)| option = option_name(name) value = type == "BOOL" ? enable_config(option) : arg_config("--#{option}") @options[name] = [type, value] end + + configure_coreml + end + + def configure_coreml + use_coreml = if @options["WHISPER_COREML"][1].nil? 
+ cmake_options["WHISPER_COREML"][1] + else + @options["WHISPER_COREML"][1] + end + $CPPFLAGS << " -DRUBY_WHISPER_USE_COREML" if use_coreml end def option_name(name) diff --git a/bindings/ruby/ext/ruby_whisper.c b/bindings/ruby/ext/ruby_whisper.c index 4a83aac9..e88aa29c 100644 --- a/bindings/ruby/ext/ruby_whisper.c +++ b/bindings/ruby/ext/ruby_whisper.c @@ -22,6 +22,8 @@ ID id_new; ID id_to_path; ID id_URI; ID id_pre_converted_models; +ID id_coreml_compiled_models; +ID id_cache; static bool is_log_callback_finalized = false; @@ -83,6 +85,14 @@ static VALUE ruby_whisper_s_lang_str_full(VALUE self, VALUE id) { return rb_str_new2(str_full); } +/* + * call-seq: + * system_info_str -> String + */ +static VALUE ruby_whisper_s_system_info_str(VALUE self) { + return rb_str_new2(whisper_print_system_info()); +} + static VALUE ruby_whisper_s_finalize_log_callback(VALUE self, VALUE id) { is_log_callback_finalized = true; return Qnil; @@ -130,6 +140,8 @@ void Init_whisper() { id_to_path = rb_intern("to_path"); id_URI = rb_intern("URI"); id_pre_converted_models = rb_intern("pre_converted_models"); + id_coreml_compiled_models = rb_intern("coreml_compiled_models"); + id_cache = rb_intern("cache"); mWhisper = rb_define_module("Whisper"); mVAD = rb_define_module_under(mWhisper, "VAD"); @@ -145,6 +157,7 @@ void Init_whisper() { rb_define_singleton_method(mWhisper, "lang_id", ruby_whisper_s_lang_id, 1); rb_define_singleton_method(mWhisper, "lang_str", ruby_whisper_s_lang_str, 1); rb_define_singleton_method(mWhisper, "lang_str_full", ruby_whisper_s_lang_str_full, 1); + rb_define_singleton_method(mWhisper, "system_info_str", ruby_whisper_s_system_info_str, 0); rb_define_singleton_method(mWhisper, "log_set", ruby_whisper_s_log_set, 2); rb_define_private_method(rb_singleton_class(mWhisper), "finalize_log_callback", ruby_whisper_s_finalize_log_callback, 1); diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index c498184e..75aa8dc9 100644 --- 
a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -11,6 +11,8 @@ extern ID id_new; extern ID id_to_path; extern ID id_URI; extern ID id_pre_converted_models; +extern ID id_coreml_compiled_models; +extern ID id_cache; extern VALUE cContext; extern VALUE eError; @@ -18,8 +20,8 @@ extern VALUE cModel; extern const rb_data_type_t ruby_whisper_params_type; extern VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self); -extern VALUE rb_whisper_model_initialize(VALUE context); -extern VALUE rb_whisper_segment_initialize(VALUE context, int index); +extern VALUE rb_whisper_model_s_new(VALUE context); +extern VALUE rb_whisper_segment_s_new(VALUE context, int index); extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context); static void @@ -53,6 +55,9 @@ ruby_whisper_memsize(const void *p) if (!rw) { return 0; } + if (rw->context) { + size += sizeof(rw->context); + } return size; } @@ -79,6 +84,13 @@ ruby_whisper_normalize_model_path(VALUE model_path) VALUE pre_converted_model = rb_hash_aref(pre_converted_models, model_path); if (!NIL_P(pre_converted_model)) { model_path = pre_converted_model; +#ifdef RUBY_WHISPER_USE_COREML + VALUE coreml_converted_models = rb_funcall(cModel, id_coreml_compiled_models, 0); + VALUE coreml_converted_model = rb_hash_aref(coreml_converted_models, pre_converted_model); + if (!NIL_P(coreml_converted_model)) { + rb_funcall(coreml_converted_model, id_cache, 0); + } +#endif } else if (TYPE(model_path) == T_STRING) { const char * model_path_str = StringValueCStr(model_path); @@ -293,13 +305,20 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self) // Should check when samples.respond_to?(:length)? 
} else { if (TYPE(samples) == T_ARRAY) { - n_samples = RARRAY_LEN(samples); + if (RARRAY_LEN(samples) > INT_MAX) { + rb_raise(rb_eArgError, "samples are too long"); + } + n_samples = (int)RARRAY_LEN(samples); } else if (memory_view_available_p) { if (!rb_memory_view_get(samples, &view, RUBY_MEMORY_VIEW_SIMPLE)) { view.obj = Qnil; rb_raise(rb_eArgError, "unable to get a memory view"); } - n_samples = view.byte_size / view.item_size; + ssize_t n_samples_size = view.byte_size / view.item_size; + if (n_samples_size > INT_MAX) { + rb_raise(rb_eArgError, "samples are too long"); + } + n_samples = (int)n_samples_size; } else if (rb_respond_to(samples, id_length)) { n_samples = NUM2INT(rb_funcall(samples, id_length, 0)); } else { @@ -387,10 +406,17 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self) view.obj = Qnil; rb_raise(rb_eArgError, "unable to get a memory view"); } - n_samples = view.byte_size / view.item_size; + ssize_t n_samples_size = view.byte_size / view.item_size; + if (n_samples_size > INT_MAX) { + rb_raise(rb_eArgError, "samples are too long"); + } + n_samples = (int)n_samples_size; } else { if (TYPE(samples) == T_ARRAY) { - n_samples = RARRAY_LEN(samples); + if (RARRAY_LEN(samples) > INT_MAX) { + rb_raise(rb_eArgError, "samples are too long"); + } + n_samples = (int)RARRAY_LEN(samples); } else if (rb_respond_to(samples, id_length)) { n_samples = NUM2INT(rb_funcall(samples, id_length, 0)); } else { @@ -476,7 +502,7 @@ ruby_whisper_full_get_segment_t0(VALUE self, VALUE i_segment) TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw); const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment); const int64_t t0 = whisper_full_get_segment_t0(rw->context, c_i_segment); - return INT2NUM(t0); + return LONG2NUM(t0); } /* @@ -494,7 +520,7 @@ ruby_whisper_full_get_segment_t1(VALUE self, VALUE i_segment) TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw); const int c_i_segment = 
ruby_whisper_full_check_segment_index(rw, i_segment); const int64_t t1 = whisper_full_get_segment_t1(rw->context, c_i_segment); - return INT2NUM(t1); + return LONG2NUM(t1); } /* @@ -552,7 +578,7 @@ ruby_whisper_full_get_segment_no_speech_prob(VALUE self, VALUE i_segment) static VALUE ruby_whisper_full_get_segment(VALUE self, VALUE i_segment) { - return rb_whisper_segment_initialize(self, NUM2INT(i_segment)); + return rb_whisper_segment_s_new(self, NUM2INT(i_segment)); } /* @@ -586,7 +612,7 @@ ruby_whisper_each_segment(VALUE self) const int n_segments = whisper_full_n_segments(rw->context); for (int i = 0; i < n_segments; ++i) { - rb_yield(rb_whisper_segment_initialize(self, i)); + rb_yield(rb_whisper_segment_s_new(self, i)); } return self; @@ -599,7 +625,7 @@ ruby_whisper_each_segment(VALUE self) static VALUE ruby_whisper_get_model(VALUE self) { - return rb_whisper_model_initialize(self); + return rb_whisper_model_s_new(self); } void diff --git a/bindings/ruby/ext/ruby_whisper_model.c b/bindings/ruby/ext/ruby_whisper_model.c index 54763c92..c6f3351e 100644 --- a/bindings/ruby/ext/ruby_whisper_model.c +++ b/bindings/ruby/ext/ruby_whisper_model.c @@ -35,7 +35,7 @@ static VALUE ruby_whisper_model_allocate(VALUE klass) { return TypedData_Make_Struct(klass, ruby_whisper_model, &rb_whisper_model_type, rwm); } -VALUE rb_whisper_model_initialize(VALUE context) { +VALUE rb_whisper_model_s_new(VALUE context) { ruby_whisper_model *rwm; const VALUE model = ruby_whisper_model_allocate(cModel); TypedData_Get_Struct(model, ruby_whisper_model, &rb_whisper_model_type, rwm); diff --git a/bindings/ruby/ext/ruby_whisper_params.c b/bindings/ruby/ext/ruby_whisper_params.c index 624e0080..71337c81 100644 --- a/bindings/ruby/ext/ruby_whisper_params.c +++ b/bindings/ruby/ext/ruby_whisper_params.c @@ -34,7 +34,7 @@ extern VALUE cVADParams; extern ID id_call; extern VALUE ruby_whisper_normalize_model_path(VALUE model_path); -extern VALUE rb_whisper_segment_initialize(VALUE context, int 
index); +extern VALUE rb_whisper_segment_s_new(VALUE context, int index); extern const rb_data_type_t ruby_whisper_vad_params_type; static ID param_names[RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT]; @@ -110,7 +110,7 @@ static void new_segment_callback(struct whisper_context *ctx, struct whisper_sta const int n_segments = whisper_full_n_segments_from_state(state); for (int i = n_new; i > 0; i--) { int i_segment = n_segments - i; - VALUE segment = rb_whisper_segment_initialize(*container->context, i_segment); + VALUE segment = rb_whisper_segment_s_new(*container->context, i_segment); for (int j = 0; j < callbacks_len; j++) { VALUE cb = rb_ary_entry(container->callbacks, j); rb_funcall(cb, id_call, 1, segment); diff --git a/bindings/ruby/ext/ruby_whisper_segment.c b/bindings/ruby/ext/ruby_whisper_segment.c index 9399f286..ce54a52d 100644 --- a/bindings/ruby/ext/ruby_whisper_segment.c +++ b/bindings/ruby/ext/ruby_whisper_segment.c @@ -38,7 +38,7 @@ ruby_whisper_segment_allocate(VALUE klass) } VALUE -rb_whisper_segment_initialize(VALUE context, int index) +rb_whisper_segment_s_new(VALUE context, int index) { ruby_whisper_segment *rws; const VALUE segment = ruby_whisper_segment_allocate(cSegment); @@ -63,7 +63,7 @@ ruby_whisper_segment_get_start_time(VALUE self) TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw); const int64_t t0 = whisper_full_get_segment_t0(rw->context, rws->index); // able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it - return INT2NUM(t0 * 10); + return LONG2NUM(t0 * 10); } /* @@ -81,7 +81,7 @@ ruby_whisper_segment_get_end_time(VALUE self) TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw); const int64_t t1 = whisper_full_get_segment_t1(rw->context, rws->index); // able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it - return INT2NUM(t1 * 10); + return LONG2NUM(t1 * 10); } /* diff --git a/bindings/ruby/lib/whisper/model/uri.rb 
b/bindings/ruby/lib/whisper/model/uri.rb index fb3ee5db..31b608ac 100644 --- a/bindings/ruby/lib/whisper/model/uri.rb +++ b/bindings/ruby/lib/whisper/model/uri.rb @@ -130,6 +130,44 @@ module Whisper end end + class ZipURI < URI + def cache + zip_path = Pathname(super) + dest = unzipped_path + return if dest.exist? && dest.mtime >= zip_path.mtime + escaping dest do + system "unzip", "-q", "-d", zip_path.dirname.to_path, zip_path.to_path, exception: true + end + zip_path.to_path + end + + def clear_cache + super + unzipped_path.rmtree if unzipped_path.exist? + end + + private + + def unzipped_path + cache_path.sub_ext("") + end + + def escaping(path) + escaped = Pathname("#{path}.removing") + if path.exist? + escaped.rmtree if escaped.exist? + path.rename escaped + end + yield + ensure + if path.exist? + escaped.rmtree if escaped.exist? + else + escaped.rename path if escaped.exist? + end + end + end + @pre_converted_models = %w[ tiny tiny.en @@ -171,8 +209,25 @@ module Whisper @pre_converted_models[name] = URI.new("https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-#{name}.bin") end + @coreml_compiled_models = %w[ + tiny + tiny.en + base + base.en + small + small.en + medium + medium.en + large-v1 + large-v2 + large-v3 + large-v3-turbo + ].each_with_object({}) do |name, models| + models[@pre_converted_models[name]] = ZipURI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}-encoder.mlmodelc.zip") + end + class << self - attr_reader :pre_converted_models + attr_reader :pre_converted_models, :coreml_compiled_models end end end diff --git a/bindings/ruby/sig/whisper.rbs b/bindings/ruby/sig/whisper.rbs index c1373c87..6f8be29a 100644 --- a/bindings/ruby/sig/whisper.rbs +++ b/bindings/ruby/sig/whisper.rbs @@ -22,6 +22,7 @@ module Whisper def self.lang_str: (Integer id) -> String def self.lang_str_full: (Integer id) -> String def self.log_set: (log_callback, Object? 
user_data) -> log_callback + def self.system_info_str: () -> String class Context def self.new: (path | ::URI::HTTP) -> instance @@ -386,6 +387,7 @@ module Whisper class Model def self.pre_converted_models: () -> Hash[String, Model::URI] + def self.coreml_compiled_models: () -> Hash[Model::URI, Model::ZipURI] def self.new: () -> instance def n_vocab: () -> Integer def n_audio_ctx: () -> Integer @@ -405,6 +407,11 @@ module Whisper def to_path: -> String def clear_cache: -> void end + + class ZipURI < URI + def cache: () -> String + def clear_cache: () -> void + end end class Segment diff --git a/bindings/ruby/tests/test_model.rb b/bindings/ruby/tests/test_model.rb index df871e0e..5648fc3f 100644 --- a/bindings/ruby/tests/test_model.rb +++ b/bindings/ruby/tests/test_model.rb @@ -106,4 +106,13 @@ class TestModel < TestBase assert_equal 1, model.ftype assert_equal "base", model.type end + + def test_coreml_model_auto_download + uri = Whisper::Model.coreml_compiled_models[Whisper::Model.pre_converted_models["tiny"]] + model_path = Pathname(uri.to_path).sub_ext("") + model_path.rmtree if model_path.exist? 
+ + uri.cache + assert_path_exist model_path + end end diff --git a/bindings/ruby/tests/test_package.rb b/bindings/ruby/tests/test_package.rb index be0bbe87..33cd2a3c 100644 --- a/bindings/ruby/tests/test_package.rb +++ b/bindings/ruby/tests/test_package.rb @@ -25,6 +25,20 @@ class TestPackage < TestBase end end + def test_install_with_coreml + omit_unless RUBY_PLATFORM.match?(/darwin/) do + gemspec = Gem::Specification.load("whispercpp.gemspec") + Dir.mktmpdir do |dir| + system "gem", "install", "--install-dir", dir.shellescape, "--no-document", "pkg/#{gemspec.file_name.shellescape}", "--", "--enable-whisper-coreml", exception: true + assert_installed dir, gemspec.version + assert_nothing_raised do + libdir = File.join(dir, "gems", "#{gemspec.name}-#{gemspec.version}", "lib") + system "ruby", "-I", libdir, "-r", "whisper", "-e", "Whisper::Context.new('tiny')", exception: true + end + end + end + end + private def assert_installed(dir, version) diff --git a/bindings/ruby/tests/test_whisper.rb b/bindings/ruby/tests/test_whisper.rb index 2754ab06..d915041f 100644 --- a/bindings/ruby/tests/test_whisper.rb +++ b/bindings/ruby/tests/test_whisper.rb @@ -94,6 +94,10 @@ class TestWhisper < TestBase end end + def test_system_info_str + assert_match /\AWHISPER : COREML = \d | OPENVINO = \d |/, Whisper.system_info_str + end + def test_log_set user_data = Object.new logs = [] diff --git a/bindings/ruby/whispercpp.gemspec b/bindings/ruby/whispercpp.gemspec index 59d65482..06bef943 100644 --- a/bindings/ruby/whispercpp.gemspec +++ b/bindings/ruby/whispercpp.gemspec @@ -4,7 +4,7 @@ Gem::Specification.new do |s| s.name = "whispercpp" s.authors = ["Georgi Gerganov", "Todd A. Fisher"] s.version = '1.3.3' - s.date = '2025-05-29' + s.date = '2025-06-01' s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby} s.email = 'todd.fisher@gmail.com' s.extra_rdoc_files = ['LICENSE', 'README.md']