KitaitiMakoto committed on
Commit
acad667
·
unverified ·
1 Parent(s): 17ba7f5

ruby : Add parallel transcription support (#3222)

Browse files

* Fix indentation of code sample in document comment

* Make Whisper::Context#transcribe able to run non-parallel

* Add test for Whisper::Context#transcribe with parallel option

* Follow signature API change of Context#transcribe

* Remove useless variable assignment

* Move simple usage up in README

* Add need help section in README

* Add document on Context#transcribe's parallel option in README

* Update date

* Fix signature of Context.new

* Make Context#transcribe accept n_processors option

* Make test follow #transcribe's change

* Make RBS follow #transcribe's change

* Add document for #transcribe's n_processors option

* Rename test directory so that Rake tasks' default setting is used

bindings/ruby/README.md CHANGED
@@ -70,17 +70,6 @@ end
70
 
71
  Some models are prepared up-front:
72
 
73
- ```ruby
74
- base_en = Whisper::Model.pre_converted_models["base.en"]
75
- whisper = Whisper::Context.new(base_en)
76
- ```
77
-
78
- The first time you use a model, it is downloaded automatically. After that, the cached file is used. To clear the cache, call `#clear_cache`:
79
-
80
- ```ruby
81
- Whisper::Model.pre_converted_models["base"].clear_cache
82
- ```
83
-
84
  You also can use shorthand for pre-converted models:
85
 
86
  ```ruby
@@ -105,6 +94,19 @@ puts Whisper::Model.pre_converted_models.keys
105
  # :
106
  ```
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  You can also use local model files you prepared:
109
 
110
  ```ruby
@@ -163,6 +165,16 @@ For details on VAD, see [whisper.cpp's README](https://github.com/ggml-org/whisp
163
  API
164
  ---
165
 
 
 
 
 
 
 
 
 
 
 
166
  ### Segments ###
167
 
168
  Once `Whisper::Context#transcribe` has been called, you can retrieve segments with `#each_segment`:
@@ -297,6 +309,11 @@ First call of `rake test` builds an extension and downloads a model for testing.
297
 
298
  If something seems wrong on build, running `rake clean` solves some cases.
299
 
 
 
 
 
 
300
  License
301
  -------
302
 
 
70
 
71
  Some models are prepared up-front:
72
 
 
 
 
 
 
 
 
 
 
 
 
73
  You also can use shorthand for pre-converted models:
74
 
75
  ```ruby
 
94
  # :
95
  ```
96
 
97
+ You can also retrieve each model:
98
+
99
+ ```ruby
100
+ base_en = Whisper::Model.pre_converted_models["base.en"]
101
+ whisper = Whisper::Context.new(base_en)
102
+ ```
103
+
104
+ The first time you use a model, it is downloaded automatically. After that, the cached file is used. To clear the cache, call `#clear_cache`:
105
+
106
+ ```ruby
107
+ Whisper::Model.pre_converted_models["base"].clear_cache
108
+ ```
109
+
110
  You can also use local model files you prepared:
111
 
112
  ```ruby
 
165
  API
166
  ---
167
 
168
+ ### Transcription ###
169
+
170
+ By default, `Whisper::Context#transcribe` works in a single thread. You can make it work in parallel by passing the `n_processors` option:
171
+
172
+ ```ruby
173
+ whisper.transcribe("path/to/audio.wav", params, n_processors: Etc.nprocessors)
174
+ ```
175
+
176
+ Note that transcription accuracy may occasionally be lower when running in parallel.
177
+
178
  ### Segments ###
179
 
180
  Once `Whisper::Context#transcribe` has been called, you can retrieve segments with `#each_segment`:
 
309
 
310
  If something seems wrong on build, running `rake clean` solves some cases.
311
 
312
+ ### Need help ###
313
+
314
+ * Windows support
315
+ * Refinement of C/C++ code, especially memory management
316
+
317
  License
318
  -------
319
 
bindings/ruby/Rakefile CHANGED
@@ -67,17 +67,15 @@ file LIB_FILE => [SO_FILE, "lib"] do |t|
67
  end
68
  CLEAN.include LIB_FILE
69
 
70
- Rake::TestTask.new do |t|
71
- t.test_files = FileList["tests/test_*.rb"]
72
- end
73
 
74
- TEST_MEMORY_VIEW = "tests/jfk_reader/jfk_reader.#{RbConfig::CONFIG['DLEXT']}"
75
- file TEST_MEMORY_VIEW => "tests/jfk_reader/jfk_reader.c" do |t|
76
- chdir "tests/jfk_reader" do
77
  ruby "extconf.rb"
78
  sh "make"
79
  end
80
  end
81
- CLEAN.include "tests/jfk_reader/jfk_reader.{o,#{RbConfig::CONFIG['DLEXT']}}"
82
 
83
  task test: [LIB_FILE, TEST_MEMORY_VIEW]
 
67
  end
68
  CLEAN.include LIB_FILE
69
 
70
+ Rake::TestTask.new
 
 
71
 
72
+ TEST_MEMORY_VIEW = "test/jfk_reader/jfk_reader.#{RbConfig::CONFIG['DLEXT']}"
73
+ file TEST_MEMORY_VIEW => "test/jfk_reader/jfk_reader.c" do |t|
74
+ chdir "test/jfk_reader" do
75
  ruby "extconf.rb"
76
  sh "make"
77
  end
78
  end
79
+ CLEAN.include "test/jfk_reader/jfk_reader.{o,#{RbConfig::CONFIG['DLEXT']}}"
80
 
81
  task test: [LIB_FILE, TEST_MEMORY_VIEW]
bindings/ruby/ext/ruby_whisper.c CHANGED
@@ -24,6 +24,7 @@ ID id_URI;
24
  ID id_pre_converted_models;
25
  ID id_coreml_compiled_models;
26
  ID id_cache;
 
27
 
28
  static bool is_log_callback_finalized = false;
29
 
@@ -142,6 +143,7 @@ void Init_whisper() {
142
  id_pre_converted_models = rb_intern("pre_converted_models");
143
  id_coreml_compiled_models = rb_intern("coreml_compiled_models");
144
  id_cache = rb_intern("cache");
 
145
 
146
  mWhisper = rb_define_module("Whisper");
147
  mVAD = rb_define_module_under(mWhisper, "VAD");
 
24
  ID id_pre_converted_models;
25
  ID id_coreml_compiled_models;
26
  ID id_cache;
27
+ ID id_n_processors;
28
 
29
  static bool is_log_callback_finalized = false;
30
 
 
143
  id_pre_converted_models = rb_intern("pre_converted_models");
144
  id_coreml_compiled_models = rb_intern("coreml_compiled_models");
145
  id_cache = rb_intern("cache");
146
+ id_n_processors = rb_intern("n_processors");
147
 
148
  mWhisper = rb_define_module("Whisper");
149
  mVAD = rb_define_module_under(mWhisper, "VAD");
bindings/ruby/ext/ruby_whisper_context.c CHANGED
@@ -13,6 +13,7 @@ extern ID id_URI;
13
  extern ID id_pre_converted_models;
14
  extern ID id_coreml_compiled_models;
15
  extern ID id_cache;
 
16
 
17
  extern VALUE cContext;
18
  extern VALUE eError;
@@ -24,6 +25,8 @@ extern VALUE rb_whisper_model_s_new(VALUE context);
24
  extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
25
  extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context);
26
 
 
 
27
  static void
28
  ruby_whisper_free(ruby_whisper *rw)
29
  {
@@ -633,6 +636,8 @@ init_ruby_whisper_context(VALUE *mWhisper)
633
  {
634
  cContext = rb_define_class_under(*mWhisper, "Context", rb_cObject);
635
 
 
 
636
  rb_define_alloc_func(cContext, ruby_whisper_allocate);
637
  rb_define_method(cContext, "initialize", ruby_whisper_initialize, -1);
638
 
 
13
  extern ID id_pre_converted_models;
14
  extern ID id_coreml_compiled_models;
15
  extern ID id_cache;
16
+ extern ID id_n_processors;
17
 
18
  extern VALUE cContext;
19
  extern VALUE eError;
 
25
  extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
26
  extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context);
27
 
28
+ ID transcribe_option_names[1];
29
+
30
  static void
31
  ruby_whisper_free(ruby_whisper *rw)
32
  {
 
636
  {
637
  cContext = rb_define_class_under(*mWhisper, "Context", rb_cObject);
638
 
639
+ transcribe_option_names[0] = id_n_processors;
640
+
641
  rb_define_alloc_func(cContext, ruby_whisper_allocate);
642
  rb_define_method(cContext, "initialize", ruby_whisper_initialize, -1);
643
 
bindings/ruby/ext/ruby_whisper_transcribe.cpp CHANGED
@@ -13,6 +13,7 @@ extern const rb_data_type_t ruby_whisper_params_type;
13
 
14
  extern ID id_to_s;
15
  extern ID id_call;
 
16
 
17
  extern void
18
  prepare_transcription(ruby_whisper_params * rwp, VALUE * self);
@@ -34,9 +35,14 @@ VALUE
34
  ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
35
  ruby_whisper *rw;
36
  ruby_whisper_params *rwp;
37
- VALUE wave_file_path, blk, params;
 
 
 
 
 
 
38
 
39
- rb_scan_args(argc, argv, "02&", &wave_file_path, &params, &blk);
40
  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
41
  TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
42
 
@@ -66,7 +72,7 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
66
 
67
  prepare_transcription(rwp, &self);
68
 
69
- if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
70
  fprintf(stderr, "failed to process audio\n");
71
  return self;
72
  }
@@ -76,9 +82,8 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
76
  const char * text = whisper_full_get_segment_text(rw->context, i);
77
  output = rb_str_concat(output, rb_str_new2(text));
78
  }
79
- VALUE idCall = id_call;
80
  if (blk != Qnil) {
81
- rb_funcall(blk, idCall, 1, output);
82
  }
83
  return self;
84
  }
 
13
 
14
  extern ID id_to_s;
15
  extern ID id_call;
16
+ extern ID transcribe_option_names[1];
17
 
18
  extern void
19
  prepare_transcription(ruby_whisper_params * rwp, VALUE * self);
 
35
  ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
36
  ruby_whisper *rw;
37
  ruby_whisper_params *rwp;
38
+ VALUE wave_file_path, blk, params, kws;
39
+ VALUE opts[1];
40
+
41
+ rb_scan_args_kw(RB_SCAN_ARGS_LAST_HASH_KEYWORDS, argc, argv, "2:&", &wave_file_path, &params, &kws, &blk);
42
+ rb_get_kwargs(kws, transcribe_option_names, 0, 1, opts);
43
+
44
+ int n_processors = opts[0] == Qundef ? 1 : NUM2INT(opts[0]);
45
 
 
46
  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
47
  TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
48
 
 
72
 
73
  prepare_transcription(rwp, &self);
74
 
75
+ if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), n_processors) != 0) {
76
  fprintf(stderr, "failed to process audio\n");
77
  return self;
78
  }
 
82
  const char * text = whisper_full_get_segment_text(rw->context, i);
83
  output = rb_str_concat(output, rb_str_new2(text));
84
  }
 
85
  if (blk != Qnil) {
86
+ rb_funcall(blk, id_call, 1, output);
87
  }
88
  return self;
89
  }
bindings/ruby/sig/whisper.rbs CHANGED
@@ -25,19 +25,19 @@ module Whisper
25
  def self.system_info_str: () -> String
26
 
27
  class Context
28
- def self.new: (path | ::URI::HTTP) -> instance
29
 
30
  # transcribe a single file
31
  # can emit to a block results
32
  #
33
- # params = Whisper::Params.new
34
- # params.duration = 60_000
35
- # whisper.transcribe "path/to/audio.wav", params do |text|
36
- # puts text
37
- # end
38
  #
39
- def transcribe: (string, Params) -> self
40
- | (string, Params) { (String) -> void } -> self
41
 
42
  def model_n_vocab: () -> Integer
43
  def model_n_audio_ctx: () -> Integer
@@ -50,16 +50,16 @@ module Whisper
50
 
51
  # Yields each Whisper::Segment:
52
  #
53
- # whisper.transcribe("path/to/audio.wav", params)
54
- # whisper.each_segment do |segment|
55
- # puts segment.text
56
- # end
57
  #
58
  # Returns an Enumerator if no block given:
59
  #
60
- # whisper.transcribe("path/to/audio.wav", params)
61
- # enum = whisper.each_segment
62
- # enum.to_a # => [#<Whisper::Segment>, ...]
63
  #
64
  def each_segment: { (Segment) -> void } -> void
65
  | () -> Enumerator[Segment]
@@ -74,25 +74,25 @@ module Whisper
74
 
75
  # Start time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds).
76
  #
77
- # full_get_segment_t0(3) # => 1668 (16680 ms)
78
  #
79
  def full_get_segment_t0: (Integer) -> Integer
80
 
81
  # End time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds).
82
  #
83
- # full_get_segment_t1(3) # => 1668 (16680 ms)
84
  #
85
  def full_get_segment_t1: (Integer) -> Integer
86
 
87
  # Whether the next segment indexed by +segment_index+ is predicted as a speaker turn.
88
  #
89
- # full_get_segment_speacker_turn_next(3) # => true
90
  #
91
  def full_get_segment_speaker_turn_next: (Integer) -> (true | false)
92
 
93
  # Text of a segment indexed by +segment_index+.
94
  #
95
- # full_get_segment_text(3) # => "ask not what your country can do for you, ..."
96
  #
97
  def full_get_segment_text: (Integer) -> String
98
 
@@ -282,9 +282,9 @@ module Whisper
282
 
283
  # Sets new segment callback, called for every newly generated text segment.
284
  #
285
- # params.new_segment_callback = ->(context, _, n_new, user_data) {
286
- # # ...
287
- # }
288
  #
289
  def new_segment_callback=: (new_segment_callback) -> new_segment_callback
290
  def new_segment_callback: () -> (new_segment_callback | nil)
@@ -297,9 +297,9 @@ module Whisper
297
 
298
  # Sets progress callback, called on each progress update.
299
  #
300
- # params.new_segment_callback = ->(context, _, progress, user_data) {
301
- # # ...
302
- # }
303
  #
304
  # +progress+ is an Integer between 0 and 100.
305
  #
@@ -327,9 +327,9 @@ module Whisper
327
 
328
  # Sets abort callback, called to check if the process should be aborted.
329
  #
330
- # params.abort_callback = ->(user_data) {
331
- # # ...
332
- # }
333
  #
334
  #
335
  def abort_callback=: (abort_callback) -> abort_callback
@@ -358,9 +358,9 @@ module Whisper
358
 
359
  # Hook called on new segment. Yields each Whisper::Segment.
360
  #
361
- # whisper.on_new_segment do |segment|
362
- # # ...
363
- # end
364
  #
365
  def on_new_segment: { (Segment) -> void } -> void
366
 
@@ -374,13 +374,13 @@ module Whisper
374
 
375
  # Call block to determine whether abort or not. Return +true+ when you want to abort.
376
  #
377
- # params.abort_on do
378
- # if some_condition
379
- # true # abort
380
- # else
381
- # false # continue
 
382
  # end
383
- # end
384
  #
385
  def abort_on: { (Object user_data) -> boolish } -> void
386
  end
 
25
  def self.system_info_str: () -> String
26
 
27
  class Context
28
+ def self.new: (String | path | ::URI::HTTP) -> instance
29
 
30
  # transcribe a single file
31
  # can emit to a block results
32
  #
33
+ # params = Whisper::Params.new
34
+ # params.duration = 60_000
35
+ # whisper.transcribe "path/to/audio.wav", params do |text|
36
+ # puts text
37
+ # end
38
  #
39
+ def transcribe: (string, Params, ?n_processors: Integer) -> self
40
+ | (string, Params, ?n_processors: Integer) { (String) -> void } -> self
41
 
42
  def model_n_vocab: () -> Integer
43
  def model_n_audio_ctx: () -> Integer
 
50
 
51
  # Yields each Whisper::Segment:
52
  #
53
+ # whisper.transcribe("path/to/audio.wav", params)
54
+ # whisper.each_segment do |segment|
55
+ # puts segment.text
56
+ # end
57
  #
58
  # Returns an Enumerator if no block given:
59
  #
60
+ # whisper.transcribe("path/to/audio.wav", params)
61
+ # enum = whisper.each_segment
62
+ # enum.to_a # => [#<Whisper::Segment>, ...]
63
  #
64
  def each_segment: { (Segment) -> void } -> void
65
  | () -> Enumerator[Segment]
 
74
 
75
  # Start time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds).
76
  #
77
+ # full_get_segment_t0(3) # => 1668 (16680 ms)
78
  #
79
  def full_get_segment_t0: (Integer) -> Integer
80
 
81
  # End time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds).
82
  #
83
+ # full_get_segment_t1(3) # => 1668 (16680 ms)
84
  #
85
  def full_get_segment_t1: (Integer) -> Integer
86
 
87
  # Whether the next segment indexed by +segment_index+ is predicted as a speaker turn.
88
  #
89
+ # full_get_segment_speaker_turn_next(3) # => true
90
  #
91
  def full_get_segment_speaker_turn_next: (Integer) -> (true | false)
92
 
93
  # Text of a segment indexed by +segment_index+.
94
  #
95
+ # full_get_segment_text(3) # => "ask not what your country can do for you, ..."
96
  #
97
  def full_get_segment_text: (Integer) -> String
98
 
 
282
 
283
  # Sets new segment callback, called for every newly generated text segment.
284
  #
285
+ # params.new_segment_callback = ->(context, _, n_new, user_data) {
286
+ # # ...
287
+ # }
288
  #
289
  def new_segment_callback=: (new_segment_callback) -> new_segment_callback
290
  def new_segment_callback: () -> (new_segment_callback | nil)
 
297
 
298
  # Sets progress callback, called on each progress update.
299
  #
300
+ # params.progress_callback = ->(context, _, progress, user_data) {
301
+ # # ...
302
+ # }
303
  #
304
  # +progress+ is an Integer between 0 and 100.
305
  #
 
327
 
328
  # Sets abort callback, called to check if the process should be aborted.
329
  #
330
+ # params.abort_callback = ->(user_data) {
331
+ # # ...
332
+ # }
333
  #
334
  #
335
  def abort_callback=: (abort_callback) -> abort_callback
 
358
 
359
  # Hook called on new segment. Yields each Whisper::Segment.
360
  #
361
+ # whisper.on_new_segment do |segment|
362
+ # # ...
363
+ # end
364
  #
365
  def on_new_segment: { (Segment) -> void } -> void
366
 
 
374
 
375
  # Call block to determine whether abort or not. Return +true+ when you want to abort.
376
  #
377
+ # params.abort_on do
378
+ # if some_condition
379
+ # true # abort
380
+ # else
381
+ # false # continue
382
+ # end
383
  # end
 
384
  #
385
  def abort_on: { (Object user_data) -> boolish } -> void
386
  end
bindings/ruby/{tests → test}/helper.rb RENAMED
File without changes
bindings/ruby/{tests → test}/jfk_reader/.gitignore RENAMED
File without changes
bindings/ruby/{tests → test}/jfk_reader/extconf.rb RENAMED
File without changes
bindings/ruby/{tests → test}/jfk_reader/jfk_reader.c RENAMED
File without changes
bindings/ruby/{tests → test}/test_callback.rb RENAMED
File without changes
bindings/ruby/{tests → test}/test_error.rb RENAMED
File without changes
bindings/ruby/{tests → test}/test_model.rb RENAMED
File without changes
bindings/ruby/{tests → test}/test_package.rb RENAMED
File without changes
bindings/ruby/{tests → test}/test_params.rb RENAMED
File without changes
bindings/ruby/{tests → test}/test_segment.rb RENAMED
File without changes
bindings/ruby/{tests → test}/test_vad.rb RENAMED
File without changes
bindings/ruby/{tests → test}/test_vad_params.rb RENAMED
File without changes
bindings/ruby/{tests → test}/test_whisper.rb RENAMED
@@ -20,6 +20,24 @@ class TestWhisper < TestBase
20
  }
21
  end
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  sub_test_case "After transcription" do
24
  def test_full_n_segments
25
  assert_equal 1, whisper.full_n_segments
 
20
  }
21
  end
22
 
23
+ def test_transcribe_non_parallel
24
+ @whisper = Whisper::Context.new("base.en")
25
+ params = Whisper::Params.new
26
+
27
+ @whisper.transcribe(AUDIO, params, n_processors: 1) {|text|
28
+ assert_match(/ask not what your country can do for you, ask what you can do for your country/, text)
29
+ }
30
+ end
31
+
32
+ def test_transcribe_n_processors
33
+ @whisper = Whisper::Context.new("base.en")
34
+ params = Whisper::Params.new
35
+
36
+ @whisper.transcribe(AUDIO, params, n_processors: 4) {|text|
37
+ assert_match(/ask not what your country can do for you[,.] ask what you can do for your country/i, text)
38
+ }
39
+ end
40
+
41
  sub_test_case "After transcription" do
42
  def test_full_n_segments
43
  assert_equal 1, whisper.full_n_segments
bindings/ruby/whispercpp.gemspec CHANGED
@@ -4,7 +4,7 @@ Gem::Specification.new do |s|
4
  s.name = "whispercpp"
5
  s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
6
  s.version = '1.3.3'
7
- s.date = '2025-06-01'
8
  s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
9
  s.email = '[email protected]'
10
  s.extra_rdoc_files = ['LICENSE', 'README.md']
@@ -21,7 +21,7 @@ Gem::Specification.new do |s|
21
  }
22
 
23
  s.summary = %q{Ruby whisper.cpp bindings}
24
- s.test_files = s.files.select {|file| file.start_with? "tests/"}
25
 
26
  s.extensions << 'ext/extconf.rb'
27
  s.required_ruby_version = '>= 3.1.0'
 
4
  s.name = "whispercpp"
5
  s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
6
  s.version = '1.3.3'
7
+ s.date = '2025-06-03'
8
  s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
9
  s.email = '[email protected]'
10
  s.extra_rdoc_files = ['LICENSE', 'README.md']
 
21
  }
22
 
23
  s.summary = %q{Ruby whisper.cpp bindings}
24
+ s.test_files = s.files.select {|file| file.start_with? "test/"}
25
 
26
  s.extensions << 'ext/extconf.rb'
27
  s.required_ruby_version = '>= 3.1.0'