KitaitiMakoto committed on
Commit
1ee7297
·
unverified ·
1 Parent(s): f44915b

ruby : add VAD support, migration to Ruby's newer API (#3197)

Browse files

* Add VAD models

* Extract function to normalize model path from ruby_whisper_initialize()

* Define ruby_whisper_vad_params struct

* Add VAD-related features to Whisper::Params

* Add tests for VAD-related features

* Define Whisper::VADParams

* Add Whisper::VAD::Params attributes

* Add test suite for VAD::Params

* Make older test follow namespace change

* Add test for transcription with VAD

* Add assertion for test_vad_params

* Add signatures for VAD-related methods

* Define VAD::Params#==

* Add test for VAD::Params#==

* Fix Params#vad_params

* Add test for Params#vad_params

* Fix signature of Params#vad_params

* Use macro to define VAD::Params params

* Define VAD::Params#initialize

* Add tests for VAD::Params#initialize

* Add signature for VAD::Params.new

* Add documentation on VAD in README

* Wrap register_callbacks in prepare_transcription for clearer meaning

* Set whisper_params.vad_params just before transcription

* Don't touch NULL

* Define ruby_whisper_params_type

* Use TypedData_XXX for ruby_whisper_params instead of Data_XXX

* Remove unused functions

* Define rb_whisper_model_data_type

* Use TypedData_XXX for ruby_whisper_model instead of Data_XXX

* Define ruby_whisper_segment_type

* Use TypedData_XXX for ruby_whisper_segment instead of Data_XXX

* Define ruby_whisper_type

* Use TypedData_XXX for ruby_whisper instead of Data_XXX

* Qualify with const

bindings/ruby/README.md CHANGED
@@ -111,6 +111,41 @@ See [models][] page for details.
111
 
112
  Currently, whisper.cpp accepts only 16-bit WAV files.
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  API
115
  ---
116
 
 
111
 
112
  Currently, whisper.cpp accepts only 16-bit WAV files.
113
 
114
+ ### Voice Activity Detection (VAD) ###
115
+
116
+ Support for Voice Activity Detection (VAD) can be enabled by setting `Whisper::Params`'s `vad` argument to `true` and specifying a VAD model:
117
+
118
+ ```ruby
119
+ Whisper::Params.new(
120
+ vad: true,
121
+ vad_model_path: "silero-v5.1.2",
122
+ # other arguments...
123
+ )
124
+ ```
125
+
126
+ When you pass the model name (`"silero-v5.1.2"`) or URI (`https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v5.1.2.bin`), it will be downloaded automatically.
127
+ Currently, "silero-v5.1.2" is registered as a pre-converted model, like the ASR models. You can also specify the file path or URI of a model.
128
+
129
+ If you need to configure VAD behavior, pass params for that:
130
+
131
+ ```ruby
132
+ Whisper::Params.new(
133
+ vad: true,
134
+ vad_model_path: "silero-v5.1.2",
135
+ vad_params: Whisper::VAD::Params.new(
136
+ threshold: 1.0, # defaults to 0.5
137
+ min_speech_duration_ms: 500, # defaults to 250
138
+ min_silence_duration_ms: 200, # defaults to 100
139
+ max_speech_duration_s: 30000, # defaults to FLT_MAX
140
+ speech_pad_ms: 50, # defaults to 30
141
+ samples_overlap: 0.5 # defaults to 0.1
142
+ ),
143
+ # other arguments...
144
+ )
145
+ ```
146
+
147
+ For details on VAD, see [whisper.cpp's README](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#voice-activity-detection-vad).
148
+
149
  API
150
  ---
151
 
bindings/ruby/ext/ruby_whisper.c CHANGED
@@ -3,8 +3,10 @@
3
  #include "ruby_whisper.h"
4
 
5
  VALUE mWhisper;
 
6
  VALUE cContext;
7
  VALUE cParams;
 
8
  VALUE eError;
9
 
10
  VALUE cSegment;
@@ -31,6 +33,7 @@ extern void init_ruby_whisper_params(VALUE *mWhisper);
31
  extern void init_ruby_whisper_error(VALUE *mWhisper);
32
  extern void init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cSegment);
33
  extern void init_ruby_whisper_model(VALUE *mWhisper);
 
34
  extern void register_callbacks(ruby_whisper_params *rwp, VALUE *context);
35
 
36
  /*
@@ -116,16 +119,6 @@ static VALUE ruby_whisper_s_log_set(VALUE self, VALUE log_callback, VALUE user_d
116
  return Qnil;
117
  }
118
 
119
- static void rb_whisper_model_mark(ruby_whisper_model *rwm) {
120
- rb_gc_mark(rwm->context);
121
- }
122
-
123
- static VALUE ruby_whisper_model_allocate(VALUE klass) {
124
- ruby_whisper_model *rwm;
125
- rwm = ALLOC(ruby_whisper_model);
126
- return Data_Wrap_Struct(klass, rb_whisper_model_mark, RUBY_DEFAULT_FREE, rwm);
127
- }
128
-
129
  void Init_whisper() {
130
  id_to_s = rb_intern("to_s");
131
  id_call = rb_intern("call");
@@ -139,6 +132,7 @@ void Init_whisper() {
139
  id_pre_converted_models = rb_intern("pre_converted_models");
140
 
141
  mWhisper = rb_define_module("Whisper");
 
142
 
143
  rb_define_const(mWhisper, "LOG_LEVEL_NONE", INT2NUM(GGML_LOG_LEVEL_NONE));
144
  rb_define_const(mWhisper, "LOG_LEVEL_INFO", INT2NUM(GGML_LOG_LEVEL_INFO));
@@ -159,6 +153,7 @@ void Init_whisper() {
159
  init_ruby_whisper_error(&mWhisper);
160
  init_ruby_whisper_segment(&mWhisper, &cContext);
161
  init_ruby_whisper_model(&mWhisper);
 
162
 
163
  rb_require("whisper/model/uri");
164
  }
 
3
  #include "ruby_whisper.h"
4
 
5
  VALUE mWhisper;
6
+ VALUE mVAD;
7
  VALUE cContext;
8
  VALUE cParams;
9
+ VALUE cVADParams;
10
  VALUE eError;
11
 
12
  VALUE cSegment;
 
33
  extern void init_ruby_whisper_error(VALUE *mWhisper);
34
  extern void init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cSegment);
35
  extern void init_ruby_whisper_model(VALUE *mWhisper);
36
+ extern void init_ruby_whisper_vad_params(VALUE *mVAD);
37
  extern void register_callbacks(ruby_whisper_params *rwp, VALUE *context);
38
 
39
  /*
 
119
  return Qnil;
120
  }
121
 
 
 
 
 
 
 
 
 
 
 
122
  void Init_whisper() {
123
  id_to_s = rb_intern("to_s");
124
  id_call = rb_intern("call");
 
132
  id_pre_converted_models = rb_intern("pre_converted_models");
133
 
134
  mWhisper = rb_define_module("Whisper");
135
+ mVAD = rb_define_module_under(mWhisper, "VAD");
136
 
137
  rb_define_const(mWhisper, "LOG_LEVEL_NONE", INT2NUM(GGML_LOG_LEVEL_NONE));
138
  rb_define_const(mWhisper, "LOG_LEVEL_INFO", INT2NUM(GGML_LOG_LEVEL_INFO));
 
153
  init_ruby_whisper_error(&mWhisper);
154
  init_ruby_whisper_segment(&mWhisper, &cContext);
155
  init_ruby_whisper_model(&mWhisper);
156
+ init_ruby_whisper_vad_params(&mVAD);
157
 
158
  rb_require("whisper/model/uri");
159
  }
bindings/ruby/ext/ruby_whisper.h CHANGED
@@ -21,8 +21,13 @@ typedef struct {
21
  ruby_whisper_callback_container *progress_callback_container;
22
  ruby_whisper_callback_container *encoder_begin_callback_container;
23
  ruby_whisper_callback_container *abort_callback_container;
 
24
  } ruby_whisper_params;
25
 
 
 
 
 
26
  typedef struct {
27
  VALUE context;
28
  int index;
 
21
  ruby_whisper_callback_container *progress_callback_container;
22
  ruby_whisper_callback_container *encoder_begin_callback_container;
23
  ruby_whisper_callback_container *abort_callback_container;
24
+ VALUE vad_params;
25
  } ruby_whisper_params;
26
 
27
+ typedef struct {
28
+ struct whisper_vad_params params;
29
+ } ruby_whisper_vad_params;
30
+
31
  typedef struct {
32
  VALUE context;
33
  int index;
bindings/ruby/ext/ruby_whisper_context.c CHANGED
@@ -16,10 +16,11 @@ extern VALUE cContext;
16
  extern VALUE eError;
17
  extern VALUE cModel;
18
 
 
19
  extern VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self);
20
  extern VALUE rb_whisper_model_initialize(VALUE context);
21
  extern VALUE rb_whisper_segment_initialize(VALUE context, int index);
22
- extern void register_callbacks(ruby_whisper_params *rwp, VALUE *context);
23
 
24
  static void
25
  ruby_whisper_free(ruby_whisper *rw)
@@ -37,19 +38,64 @@ rb_whisper_mark(ruby_whisper *rw)
37
  }
38
 
39
  void
40
- rb_whisper_free(ruby_whisper *rw)
41
  {
 
42
  ruby_whisper_free(rw);
43
  free(rw);
44
  }
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  static VALUE
47
  ruby_whisper_allocate(VALUE klass)
48
  {
49
  ruby_whisper *rw;
50
- rw = ALLOC(ruby_whisper);
51
  rw->context = NULL;
52
- return Data_Wrap_Struct(klass, rb_whisper_mark, rb_whisper_free, rw);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  }
54
 
55
  /*
@@ -66,27 +112,9 @@ ruby_whisper_initialize(int argc, VALUE *argv, VALUE self)
66
 
67
  // TODO: we can support init from buffer here too maybe another ruby object to expose
68
  rb_scan_args(argc, argv, "01", &whisper_model_file_path);
69
- Data_Get_Struct(self, ruby_whisper, rw);
70
 
71
- VALUE pre_converted_models = rb_funcall(cModel, id_pre_converted_models, 0);
72
- VALUE pre_converted_model = rb_hash_aref(pre_converted_models, whisper_model_file_path);
73
- if (!NIL_P(pre_converted_model)) {
74
- whisper_model_file_path = pre_converted_model;
75
- }
76
- if (TYPE(whisper_model_file_path) == T_STRING) {
77
- const char * whisper_model_file_path_str = StringValueCStr(whisper_model_file_path);
78
- if (strncmp("http://", whisper_model_file_path_str, 7) == 0 || strncmp("https://", whisper_model_file_path_str, 8) == 0) {
79
- VALUE uri_class = rb_const_get(cModel, id_URI);
80
- whisper_model_file_path = rb_class_new_instance(1, &whisper_model_file_path, uri_class);
81
- }
82
- }
83
- if (rb_obj_is_kind_of(whisper_model_file_path, rb_path2class("URI::HTTP"))) {
84
- VALUE uri_class = rb_const_get(cModel, id_URI);
85
- whisper_model_file_path = rb_class_new_instance(1, &whisper_model_file_path, uri_class);
86
- }
87
- if (rb_respond_to(whisper_model_file_path, id_to_path)) {
88
- whisper_model_file_path = rb_funcall(whisper_model_file_path, id_to_path, 0);
89
- }
90
  if (!rb_respond_to(whisper_model_file_path, id_to_s)) {
91
  rb_raise(rb_eRuntimeError, "Expected file path to model to initialize Whisper::Context");
92
  }
@@ -104,7 +132,7 @@ ruby_whisper_initialize(int argc, VALUE *argv, VALUE self)
104
  VALUE ruby_whisper_model_n_vocab(VALUE self)
105
  {
106
  ruby_whisper *rw;
107
- Data_Get_Struct(self, ruby_whisper, rw);
108
  return INT2NUM(whisper_model_n_vocab(rw->context));
109
  }
110
 
@@ -115,7 +143,7 @@ VALUE ruby_whisper_model_n_vocab(VALUE self)
115
  VALUE ruby_whisper_model_n_audio_ctx(VALUE self)
116
  {
117
  ruby_whisper *rw;
118
- Data_Get_Struct(self, ruby_whisper, rw);
119
  return INT2NUM(whisper_model_n_audio_ctx(rw->context));
120
  }
121
 
@@ -126,7 +154,7 @@ VALUE ruby_whisper_model_n_audio_ctx(VALUE self)
126
  VALUE ruby_whisper_model_n_audio_state(VALUE self)
127
  {
128
  ruby_whisper *rw;
129
- Data_Get_Struct(self, ruby_whisper, rw);
130
  return INT2NUM(whisper_model_n_audio_state(rw->context));
131
  }
132
 
@@ -137,7 +165,7 @@ VALUE ruby_whisper_model_n_audio_state(VALUE self)
137
  VALUE ruby_whisper_model_n_audio_head(VALUE self)
138
  {
139
  ruby_whisper *rw;
140
- Data_Get_Struct(self, ruby_whisper, rw);
141
  return INT2NUM(whisper_model_n_audio_head(rw->context));
142
  }
143
 
@@ -148,7 +176,7 @@ VALUE ruby_whisper_model_n_audio_head(VALUE self)
148
  VALUE ruby_whisper_model_n_audio_layer(VALUE self)
149
  {
150
  ruby_whisper *rw;
151
- Data_Get_Struct(self, ruby_whisper, rw);
152
  return INT2NUM(whisper_model_n_audio_layer(rw->context));
153
  }
154
 
@@ -159,7 +187,7 @@ VALUE ruby_whisper_model_n_audio_layer(VALUE self)
159
  VALUE ruby_whisper_model_n_text_ctx(VALUE self)
160
  {
161
  ruby_whisper *rw;
162
- Data_Get_Struct(self, ruby_whisper, rw);
163
  return INT2NUM(whisper_model_n_text_ctx(rw->context));
164
  }
165
 
@@ -170,7 +198,7 @@ VALUE ruby_whisper_model_n_text_ctx(VALUE self)
170
  VALUE ruby_whisper_model_n_text_state(VALUE self)
171
  {
172
  ruby_whisper *rw;
173
- Data_Get_Struct(self, ruby_whisper, rw);
174
  return INT2NUM(whisper_model_n_text_state(rw->context));
175
  }
176
 
@@ -181,7 +209,7 @@ VALUE ruby_whisper_model_n_text_state(VALUE self)
181
  VALUE ruby_whisper_model_n_text_head(VALUE self)
182
  {
183
  ruby_whisper *rw;
184
- Data_Get_Struct(self, ruby_whisper, rw);
185
  return INT2NUM(whisper_model_n_text_head(rw->context));
186
  }
187
 
@@ -192,7 +220,7 @@ VALUE ruby_whisper_model_n_text_head(VALUE self)
192
  VALUE ruby_whisper_model_n_text_layer(VALUE self)
193
  {
194
  ruby_whisper *rw;
195
- Data_Get_Struct(self, ruby_whisper, rw);
196
  return INT2NUM(whisper_model_n_text_layer(rw->context));
197
  }
198
 
@@ -203,7 +231,7 @@ VALUE ruby_whisper_model_n_text_layer(VALUE self)
203
  VALUE ruby_whisper_model_n_mels(VALUE self)
204
  {
205
  ruby_whisper *rw;
206
- Data_Get_Struct(self, ruby_whisper, rw);
207
  return INT2NUM(whisper_model_n_mels(rw->context));
208
  }
209
 
@@ -214,7 +242,7 @@ VALUE ruby_whisper_model_n_mels(VALUE self)
214
  VALUE ruby_whisper_model_ftype(VALUE self)
215
  {
216
  ruby_whisper *rw;
217
- Data_Get_Struct(self, ruby_whisper, rw);
218
  return INT2NUM(whisper_model_ftype(rw->context));
219
  }
220
 
@@ -225,7 +253,7 @@ VALUE ruby_whisper_model_ftype(VALUE self)
225
  VALUE ruby_whisper_model_type(VALUE self)
226
  {
227
  ruby_whisper *rw;
228
- Data_Get_Struct(self, ruby_whisper, rw);
229
  return rb_str_new2(whisper_model_type_readable(rw->context));
230
  }
231
 
@@ -248,9 +276,9 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self)
248
 
249
  ruby_whisper *rw;
250
  ruby_whisper_params *rwp;
251
- Data_Get_Struct(self, ruby_whisper, rw);
252
  VALUE params = argv[0];
253
- Data_Get_Struct(params, ruby_whisper_params, rwp);
254
  VALUE samples = argv[1];
255
  int n_samples;
256
  rb_memory_view_t view;
@@ -296,7 +324,7 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self)
296
  }
297
  }
298
  }
299
- register_callbacks(rwp, &self);
300
  const int result = whisper_full(rw->context, rwp->params, c_samples, n_samples);
301
  if (0 == result) {
302
  return self;
@@ -327,9 +355,9 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self)
327
 
328
  ruby_whisper *rw;
329
  ruby_whisper_params *rwp;
330
- Data_Get_Struct(self, ruby_whisper, rw);
331
  VALUE params = argv[0];
332
- Data_Get_Struct(params, ruby_whisper_params, rwp);
333
  VALUE samples = argv[1];
334
  int n_samples;
335
  int n_processors;
@@ -387,7 +415,7 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self)
387
  }
388
  }
389
  }
390
- register_callbacks(rwp, &self);
391
  const int result = whisper_full_parallel(rw->context, rwp->params, c_samples, n_samples, n_processors);
392
  if (0 == result) {
393
  return self;
@@ -406,7 +434,7 @@ static VALUE
406
  ruby_whisper_full_n_segments(VALUE self)
407
  {
408
  ruby_whisper *rw;
409
- Data_Get_Struct(self, ruby_whisper, rw);
410
  return INT2NUM(whisper_full_n_segments(rw->context));
411
  }
412
 
@@ -420,7 +448,7 @@ static VALUE
420
  ruby_whisper_full_lang_id(VALUE self)
421
  {
422
  ruby_whisper *rw;
423
- Data_Get_Struct(self, ruby_whisper, rw);
424
  return INT2NUM(whisper_full_lang_id(rw->context));
425
  }
426
 
@@ -445,7 +473,7 @@ static VALUE
445
  ruby_whisper_full_get_segment_t0(VALUE self, VALUE i_segment)
446
  {
447
  ruby_whisper *rw;
448
- Data_Get_Struct(self, ruby_whisper, rw);
449
  const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
450
  const int64_t t0 = whisper_full_get_segment_t0(rw->context, c_i_segment);
451
  return INT2NUM(t0);
@@ -463,7 +491,7 @@ static VALUE
463
  ruby_whisper_full_get_segment_t1(VALUE self, VALUE i_segment)
464
  {
465
  ruby_whisper *rw;
466
- Data_Get_Struct(self, ruby_whisper, rw);
467
  const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
468
  const int64_t t1 = whisper_full_get_segment_t1(rw->context, c_i_segment);
469
  return INT2NUM(t1);
@@ -481,7 +509,7 @@ static VALUE
481
  ruby_whisper_full_get_segment_speaker_turn_next(VALUE self, VALUE i_segment)
482
  {
483
  ruby_whisper *rw;
484
- Data_Get_Struct(self, ruby_whisper, rw);
485
  const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
486
  const bool speaker_turn_next = whisper_full_get_segment_speaker_turn_next(rw->context, c_i_segment);
487
  return speaker_turn_next ? Qtrue : Qfalse;
@@ -499,7 +527,7 @@ static VALUE
499
  ruby_whisper_full_get_segment_text(VALUE self, VALUE i_segment)
500
  {
501
  ruby_whisper *rw;
502
- Data_Get_Struct(self, ruby_whisper, rw);
503
  const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
504
  const char * text = whisper_full_get_segment_text(rw->context, c_i_segment);
505
  return rb_str_new2(text);
@@ -513,7 +541,7 @@ static VALUE
513
  ruby_whisper_full_get_segment_no_speech_prob(VALUE self, VALUE i_segment)
514
  {
515
  ruby_whisper *rw;
516
- Data_Get_Struct(self, ruby_whisper, rw);
517
  const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
518
  const float no_speech_prob = whisper_full_get_segment_no_speech_prob(rw->context, c_i_segment);
519
  return DBL2NUM(no_speech_prob);
@@ -554,7 +582,7 @@ ruby_whisper_each_segment(VALUE self)
554
  }
555
 
556
  ruby_whisper *rw;
557
- Data_Get_Struct(self, ruby_whisper, rw);
558
 
559
  const int n_segments = whisper_full_n_segments(rw->context);
560
  for (int i = 0; i < n_segments; ++i) {
 
16
  extern VALUE eError;
17
  extern VALUE cModel;
18
 
19
+ extern const rb_data_type_t ruby_whisper_params_type;
20
  extern VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self);
21
  extern VALUE rb_whisper_model_initialize(VALUE context);
22
  extern VALUE rb_whisper_segment_initialize(VALUE context, int index);
23
+ extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context);
24
 
25
  static void
26
  ruby_whisper_free(ruby_whisper *rw)
 
38
  }
39
 
40
  void
41
+ rb_whisper_free(void *p)
42
  {
43
+ ruby_whisper *rw = (ruby_whisper *)p;
44
  ruby_whisper_free(rw);
45
  free(rw);
46
  }
47
 
48
+ static size_t
49
+ ruby_whisper_memsize(const void *p)
50
+ {
51
+ const ruby_whisper *rw = (const ruby_whisper *)p;
52
+ size_t size = sizeof(rw);
53
+ if (!rw) {
54
+ return 0;
55
+ }
56
+ return size;
57
+ }
58
+
59
+ const rb_data_type_t ruby_whisper_type = {
60
+ "ruby_whisper",
61
+ {0, rb_whisper_free, ruby_whisper_memsize,},
62
+ 0, 0,
63
+ 0
64
+ };
65
+
66
  static VALUE
67
  ruby_whisper_allocate(VALUE klass)
68
  {
69
  ruby_whisper *rw;
70
+ VALUE obj = TypedData_Make_Struct(klass, ruby_whisper, &ruby_whisper_type, rw);
71
  rw->context = NULL;
72
+ return obj;
73
+ }
74
+
75
+ VALUE
76
+ ruby_whisper_normalize_model_path(VALUE model_path)
77
+ {
78
+ VALUE pre_converted_models = rb_funcall(cModel, id_pre_converted_models, 0);
79
+ VALUE pre_converted_model = rb_hash_aref(pre_converted_models, model_path);
80
+ if (!NIL_P(pre_converted_model)) {
81
+ model_path = pre_converted_model;
82
+ }
83
+ else if (TYPE(model_path) == T_STRING) {
84
+ const char * model_path_str = StringValueCStr(model_path);
85
+ if (strncmp("http://", model_path_str, 7) == 0 || strncmp("https://", model_path_str, 8) == 0) {
86
+ VALUE uri_class = rb_const_get(cModel, id_URI);
87
+ model_path = rb_class_new_instance(1, &model_path, uri_class);
88
+ }
89
+ }
90
+ else if (rb_obj_is_kind_of(model_path, rb_path2class("URI::HTTP"))) {
91
+ VALUE uri_class = rb_const_get(cModel, id_URI);
92
+ model_path = rb_class_new_instance(1, &model_path, uri_class);
93
+ }
94
+ if (rb_respond_to(model_path, id_to_path)) {
95
+ model_path = rb_funcall(model_path, id_to_path, 0);
96
+ }
97
+
98
+ return model_path;
99
  }
100
 
101
  /*
 
112
 
113
  // TODO: we can support init from buffer here too maybe another ruby object to expose
114
  rb_scan_args(argc, argv, "01", &whisper_model_file_path);
115
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
116
 
117
+ whisper_model_file_path = ruby_whisper_normalize_model_path(whisper_model_file_path);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  if (!rb_respond_to(whisper_model_file_path, id_to_s)) {
119
  rb_raise(rb_eRuntimeError, "Expected file path to model to initialize Whisper::Context");
120
  }
 
132
  VALUE ruby_whisper_model_n_vocab(VALUE self)
133
  {
134
  ruby_whisper *rw;
135
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
136
  return INT2NUM(whisper_model_n_vocab(rw->context));
137
  }
138
 
 
143
  VALUE ruby_whisper_model_n_audio_ctx(VALUE self)
144
  {
145
  ruby_whisper *rw;
146
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
147
  return INT2NUM(whisper_model_n_audio_ctx(rw->context));
148
  }
149
 
 
154
  VALUE ruby_whisper_model_n_audio_state(VALUE self)
155
  {
156
  ruby_whisper *rw;
157
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
158
  return INT2NUM(whisper_model_n_audio_state(rw->context));
159
  }
160
 
 
165
  VALUE ruby_whisper_model_n_audio_head(VALUE self)
166
  {
167
  ruby_whisper *rw;
168
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
169
  return INT2NUM(whisper_model_n_audio_head(rw->context));
170
  }
171
 
 
176
  VALUE ruby_whisper_model_n_audio_layer(VALUE self)
177
  {
178
  ruby_whisper *rw;
179
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
180
  return INT2NUM(whisper_model_n_audio_layer(rw->context));
181
  }
182
 
 
187
  VALUE ruby_whisper_model_n_text_ctx(VALUE self)
188
  {
189
  ruby_whisper *rw;
190
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
191
  return INT2NUM(whisper_model_n_text_ctx(rw->context));
192
  }
193
 
 
198
  VALUE ruby_whisper_model_n_text_state(VALUE self)
199
  {
200
  ruby_whisper *rw;
201
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
202
  return INT2NUM(whisper_model_n_text_state(rw->context));
203
  }
204
 
 
209
  VALUE ruby_whisper_model_n_text_head(VALUE self)
210
  {
211
  ruby_whisper *rw;
212
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
213
  return INT2NUM(whisper_model_n_text_head(rw->context));
214
  }
215
 
 
220
  VALUE ruby_whisper_model_n_text_layer(VALUE self)
221
  {
222
  ruby_whisper *rw;
223
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
224
  return INT2NUM(whisper_model_n_text_layer(rw->context));
225
  }
226
 
 
231
  VALUE ruby_whisper_model_n_mels(VALUE self)
232
  {
233
  ruby_whisper *rw;
234
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
235
  return INT2NUM(whisper_model_n_mels(rw->context));
236
  }
237
 
 
242
  VALUE ruby_whisper_model_ftype(VALUE self)
243
  {
244
  ruby_whisper *rw;
245
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
246
  return INT2NUM(whisper_model_ftype(rw->context));
247
  }
248
 
 
253
  VALUE ruby_whisper_model_type(VALUE self)
254
  {
255
  ruby_whisper *rw;
256
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
257
  return rb_str_new2(whisper_model_type_readable(rw->context));
258
  }
259
 
 
276
 
277
  ruby_whisper *rw;
278
  ruby_whisper_params *rwp;
279
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
280
  VALUE params = argv[0];
281
+ TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
282
  VALUE samples = argv[1];
283
  int n_samples;
284
  rb_memory_view_t view;
 
324
  }
325
  }
326
  }
327
+ prepare_transcription(rwp, &self);
328
  const int result = whisper_full(rw->context, rwp->params, c_samples, n_samples);
329
  if (0 == result) {
330
  return self;
 
355
 
356
  ruby_whisper *rw;
357
  ruby_whisper_params *rwp;
358
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
359
  VALUE params = argv[0];
360
+ TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
361
  VALUE samples = argv[1];
362
  int n_samples;
363
  int n_processors;
 
415
  }
416
  }
417
  }
418
+ prepare_transcription(rwp, &self);
419
  const int result = whisper_full_parallel(rw->context, rwp->params, c_samples, n_samples, n_processors);
420
  if (0 == result) {
421
  return self;
 
434
  ruby_whisper_full_n_segments(VALUE self)
435
  {
436
  ruby_whisper *rw;
437
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
438
  return INT2NUM(whisper_full_n_segments(rw->context));
439
  }
440
 
 
448
  ruby_whisper_full_lang_id(VALUE self)
449
  {
450
  ruby_whisper *rw;
451
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
452
  return INT2NUM(whisper_full_lang_id(rw->context));
453
  }
454
 
 
473
  ruby_whisper_full_get_segment_t0(VALUE self, VALUE i_segment)
474
  {
475
  ruby_whisper *rw;
476
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
477
  const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
478
  const int64_t t0 = whisper_full_get_segment_t0(rw->context, c_i_segment);
479
  return INT2NUM(t0);
 
491
  ruby_whisper_full_get_segment_t1(VALUE self, VALUE i_segment)
492
  {
493
  ruby_whisper *rw;
494
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
495
  const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
496
  const int64_t t1 = whisper_full_get_segment_t1(rw->context, c_i_segment);
497
  return INT2NUM(t1);
 
509
  ruby_whisper_full_get_segment_speaker_turn_next(VALUE self, VALUE i_segment)
510
  {
511
  ruby_whisper *rw;
512
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
513
  const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
514
  const bool speaker_turn_next = whisper_full_get_segment_speaker_turn_next(rw->context, c_i_segment);
515
  return speaker_turn_next ? Qtrue : Qfalse;
 
527
  ruby_whisper_full_get_segment_text(VALUE self, VALUE i_segment)
528
  {
529
  ruby_whisper *rw;
530
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
531
  const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
532
  const char * text = whisper_full_get_segment_text(rw->context, c_i_segment);
533
  return rb_str_new2(text);
 
541
  ruby_whisper_full_get_segment_no_speech_prob(VALUE self, VALUE i_segment)
542
  {
543
  ruby_whisper *rw;
544
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
545
  const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
546
  const float no_speech_prob = whisper_full_get_segment_no_speech_prob(rw->context, c_i_segment);
547
  return DBL2NUM(no_speech_prob);
 
582
  }
583
 
584
  ruby_whisper *rw;
585
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
586
 
587
  const int n_segments = whisper_full_n_segments(rw->context);
588
  for (int i = 0; i < n_segments; ++i) {
bindings/ruby/ext/ruby_whisper_model.c CHANGED
@@ -1,22 +1,44 @@
1
  #include <ruby.h>
2
  #include "ruby_whisper.h"
3
 
 
 
4
  extern VALUE cModel;
5
 
6
- static void rb_whisper_model_mark(ruby_whisper_model *rwm) {
7
- rb_gc_mark(rwm->context);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  }
9
 
 
 
 
 
 
 
 
10
  static VALUE ruby_whisper_model_allocate(VALUE klass) {
11
  ruby_whisper_model *rwm;
12
- rwm = ALLOC(ruby_whisper_model);
13
- return Data_Wrap_Struct(klass, rb_whisper_model_mark, RUBY_DEFAULT_FREE, rwm);
14
  }
15
 
16
  VALUE rb_whisper_model_initialize(VALUE context) {
17
  ruby_whisper_model *rwm;
18
  const VALUE model = ruby_whisper_model_allocate(cModel);
19
- Data_Get_Struct(model, ruby_whisper_model, rwm);
20
  rwm->context = context;
21
  return model;
22
  };
@@ -29,9 +51,9 @@ static VALUE
29
  ruby_whisper_model_n_vocab(VALUE self)
30
  {
31
  ruby_whisper_model *rwm;
32
- Data_Get_Struct(self, ruby_whisper_model, rwm);
33
  ruby_whisper *rw;
34
- Data_Get_Struct(rwm->context, ruby_whisper, rw);
35
  return INT2NUM(whisper_model_n_vocab(rw->context));
36
  }
37
 
@@ -43,9 +65,9 @@ static VALUE
43
  ruby_whisper_model_n_audio_ctx(VALUE self)
44
  {
45
  ruby_whisper_model *rwm;
46
- Data_Get_Struct(self, ruby_whisper_model, rwm);
47
  ruby_whisper *rw;
48
- Data_Get_Struct(rwm->context, ruby_whisper, rw);
49
  return INT2NUM(whisper_model_n_audio_ctx(rw->context));
50
  }
51
 
@@ -57,9 +79,9 @@ static VALUE
57
  ruby_whisper_model_n_audio_state(VALUE self)
58
  {
59
  ruby_whisper_model *rwm;
60
- Data_Get_Struct(self, ruby_whisper_model, rwm);
61
  ruby_whisper *rw;
62
- Data_Get_Struct(rwm->context, ruby_whisper, rw);
63
  return INT2NUM(whisper_model_n_audio_state(rw->context));
64
  }
65
 
@@ -71,9 +93,9 @@ static VALUE
71
  ruby_whisper_model_n_audio_head(VALUE self)
72
  {
73
  ruby_whisper_model *rwm;
74
- Data_Get_Struct(self, ruby_whisper_model, rwm);
75
  ruby_whisper *rw;
76
- Data_Get_Struct(rwm->context, ruby_whisper, rw);
77
  return INT2NUM(whisper_model_n_audio_head(rw->context));
78
  }
79
 
@@ -85,9 +107,9 @@ static VALUE
85
  ruby_whisper_model_n_audio_layer(VALUE self)
86
  {
87
  ruby_whisper_model *rwm;
88
- Data_Get_Struct(self, ruby_whisper_model, rwm);
89
  ruby_whisper *rw;
90
- Data_Get_Struct(rwm->context, ruby_whisper, rw);
91
  return INT2NUM(whisper_model_n_audio_layer(rw->context));
92
  }
93
 
@@ -99,9 +121,9 @@ static VALUE
99
  ruby_whisper_model_n_text_ctx(VALUE self)
100
  {
101
  ruby_whisper_model *rwm;
102
- Data_Get_Struct(self, ruby_whisper_model, rwm);
103
  ruby_whisper *rw;
104
- Data_Get_Struct(rwm->context, ruby_whisper, rw);
105
  return INT2NUM(whisper_model_n_text_ctx(rw->context));
106
  }
107
 
@@ -113,9 +135,9 @@ static VALUE
113
  ruby_whisper_model_n_text_state(VALUE self)
114
  {
115
  ruby_whisper_model *rwm;
116
- Data_Get_Struct(self, ruby_whisper_model, rwm);
117
  ruby_whisper *rw;
118
- Data_Get_Struct(rwm->context, ruby_whisper, rw);
119
  return INT2NUM(whisper_model_n_text_state(rw->context));
120
  }
121
 
@@ -127,9 +149,9 @@ static VALUE
127
  ruby_whisper_model_n_text_head(VALUE self)
128
  {
129
  ruby_whisper_model *rwm;
130
- Data_Get_Struct(self, ruby_whisper_model, rwm);
131
  ruby_whisper *rw;
132
- Data_Get_Struct(rwm->context, ruby_whisper, rw);
133
  return INT2NUM(whisper_model_n_text_head(rw->context));
134
  }
135
 
@@ -141,9 +163,9 @@ static VALUE
141
  ruby_whisper_model_n_text_layer(VALUE self)
142
  {
143
  ruby_whisper_model *rwm;
144
- Data_Get_Struct(self, ruby_whisper_model, rwm);
145
  ruby_whisper *rw;
146
- Data_Get_Struct(rwm->context, ruby_whisper, rw);
147
  return INT2NUM(whisper_model_n_text_layer(rw->context));
148
  }
149
 
@@ -155,9 +177,9 @@ static VALUE
155
  ruby_whisper_model_n_mels(VALUE self)
156
  {
157
  ruby_whisper_model *rwm;
158
- Data_Get_Struct(self, ruby_whisper_model, rwm);
159
  ruby_whisper *rw;
160
- Data_Get_Struct(rwm->context, ruby_whisper, rw);
161
  return INT2NUM(whisper_model_n_mels(rw->context));
162
  }
163
 
@@ -169,9 +191,9 @@ static VALUE
169
  ruby_whisper_model_ftype(VALUE self)
170
  {
171
  ruby_whisper_model *rwm;
172
- Data_Get_Struct(self, ruby_whisper_model, rwm);
173
  ruby_whisper *rw;
174
- Data_Get_Struct(rwm->context, ruby_whisper, rw);
175
  return INT2NUM(whisper_model_ftype(rw->context));
176
  }
177
 
@@ -183,9 +205,9 @@ static VALUE
183
  ruby_whisper_model_type(VALUE self)
184
  {
185
  ruby_whisper_model *rwm;
186
- Data_Get_Struct(self, ruby_whisper_model, rwm);
187
  ruby_whisper *rw;
188
- Data_Get_Struct(rwm->context, ruby_whisper, rw);
189
  return rb_str_new2(whisper_model_type_readable(rw->context));
190
  }
191
 
 
1
  #include <ruby.h>
2
  #include "ruby_whisper.h"
3
 
4
+ extern const rb_data_type_t ruby_whisper_type;
5
+
6
  extern VALUE cModel;
7
 
8
+ static void rb_whisper_model_mark(void *p) {
9
+ ruby_whisper_model *rwm = (ruby_whisper_model *)p;
10
+ if (rwm->context) {
11
+ rb_gc_mark(rwm->context);
12
+ }
13
+ }
14
+
15
+ static size_t
16
+ ruby_whisper_model_memsize(const void *p)
17
+ {
18
+ const ruby_whisper_model *rwm = (const ruby_whisper_model *)p;
19
+ size_t size = sizeof(rwm);
20
+ if (!rwm) {
21
+ return 0;
22
+ }
23
+ return size;
24
  }
25
 
26
+ static const rb_data_type_t rb_whisper_model_type = {
27
+ "ruby_whisper_model",
28
+ {rb_whisper_model_mark, RUBY_DEFAULT_FREE, ruby_whisper_model_memsize,},
29
+ 0, 0,
30
+ 0
31
+ };
32
+
33
  static VALUE ruby_whisper_model_allocate(VALUE klass) {
34
  ruby_whisper_model *rwm;
35
+ return TypedData_Make_Struct(klass, ruby_whisper_model, &rb_whisper_model_type, rwm);
 
36
  }
37
 
38
  VALUE rb_whisper_model_initialize(VALUE context) {
39
  ruby_whisper_model *rwm;
40
  const VALUE model = ruby_whisper_model_allocate(cModel);
41
+ TypedData_Get_Struct(model, ruby_whisper_model, &rb_whisper_model_type, rwm);
42
  rwm->context = context;
43
  return model;
44
  };
 
51
  ruby_whisper_model_n_vocab(VALUE self)
52
  {
53
  ruby_whisper_model *rwm;
54
+ TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
55
  ruby_whisper *rw;
56
+ TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
57
  return INT2NUM(whisper_model_n_vocab(rw->context));
58
  }
59
 
 
65
  ruby_whisper_model_n_audio_ctx(VALUE self)
66
  {
67
  ruby_whisper_model *rwm;
68
+ TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
69
  ruby_whisper *rw;
70
+ TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
71
  return INT2NUM(whisper_model_n_audio_ctx(rw->context));
72
  }
73
 
 
79
  ruby_whisper_model_n_audio_state(VALUE self)
80
  {
81
  ruby_whisper_model *rwm;
82
+ TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
83
  ruby_whisper *rw;
84
+ TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
85
  return INT2NUM(whisper_model_n_audio_state(rw->context));
86
  }
87
 
 
93
  ruby_whisper_model_n_audio_head(VALUE self)
94
  {
95
  ruby_whisper_model *rwm;
96
+ TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
97
  ruby_whisper *rw;
98
+ TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
99
  return INT2NUM(whisper_model_n_audio_head(rw->context));
100
  }
101
 
 
107
  ruby_whisper_model_n_audio_layer(VALUE self)
108
  {
109
  ruby_whisper_model *rwm;
110
+ TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
111
  ruby_whisper *rw;
112
+ TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
113
  return INT2NUM(whisper_model_n_audio_layer(rw->context));
114
  }
115
 
 
121
  ruby_whisper_model_n_text_ctx(VALUE self)
122
  {
123
  ruby_whisper_model *rwm;
124
+ TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
125
  ruby_whisper *rw;
126
+ TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
127
  return INT2NUM(whisper_model_n_text_ctx(rw->context));
128
  }
129
 
 
135
  ruby_whisper_model_n_text_state(VALUE self)
136
  {
137
  ruby_whisper_model *rwm;
138
+ TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
139
  ruby_whisper *rw;
140
+ TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
141
  return INT2NUM(whisper_model_n_text_state(rw->context));
142
  }
143
 
 
149
  ruby_whisper_model_n_text_head(VALUE self)
150
  {
151
  ruby_whisper_model *rwm;
152
+ TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
153
  ruby_whisper *rw;
154
+ TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
155
  return INT2NUM(whisper_model_n_text_head(rw->context));
156
  }
157
 
 
163
  ruby_whisper_model_n_text_layer(VALUE self)
164
  {
165
  ruby_whisper_model *rwm;
166
+ TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
167
  ruby_whisper *rw;
168
+ TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
169
  return INT2NUM(whisper_model_n_text_layer(rw->context));
170
  }
171
 
 
177
  ruby_whisper_model_n_mels(VALUE self)
178
  {
179
  ruby_whisper_model *rwm;
180
+ TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
181
  ruby_whisper *rw;
182
+ TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
183
  return INT2NUM(whisper_model_n_mels(rw->context));
184
  }
185
 
 
191
  ruby_whisper_model_ftype(VALUE self)
192
  {
193
  ruby_whisper_model *rwm;
194
+ TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
195
  ruby_whisper *rw;
196
+ TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
197
  return INT2NUM(whisper_model_ftype(rw->context));
198
  }
199
 
 
205
  ruby_whisper_model_type(VALUE self)
206
  {
207
  ruby_whisper_model *rwm;
208
+ TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
209
  ruby_whisper *rw;
210
+ TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
211
  return rb_str_new2(whisper_model_type_readable(rw->context));
212
  }
213
 
bindings/ruby/ext/ruby_whisper_params.c CHANGED
@@ -3,7 +3,7 @@
3
 
4
  #define BOOL_PARAMS_SETTER(self, prop, value) \
5
  ruby_whisper_params *rwp; \
6
- Data_Get_Struct(self, ruby_whisper_params, rwp); \
7
  if (value == Qfalse || value == Qnil) { \
8
  rwp->params.prop = false; \
9
  } else { \
@@ -13,7 +13,7 @@
13
 
14
  #define BOOL_PARAMS_GETTER(self, prop) \
15
  ruby_whisper_params *rwp; \
16
- Data_Get_Struct(self, ruby_whisper_params, rwp); \
17
  if (rwp->params.prop) { \
18
  return Qtrue; \
19
  } else { \
@@ -26,13 +26,16 @@
26
  rb_define_method(cParams, #param_name, ruby_whisper_params_get_ ## param_name, 0); \
27
  rb_define_method(cParams, #param_name "=", ruby_whisper_params_set_ ## param_name, 1);
28
 
29
- #define RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT 32
30
 
31
  extern VALUE cParams;
 
32
 
33
  extern ID id_call;
34
 
 
35
  extern VALUE rb_whisper_segment_initialize(VALUE context, int index);
 
36
 
37
  static ID param_names[RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT];
38
  static ID id_language;
@@ -67,6 +70,9 @@ static ID id_encoder_begin_callback;
67
  static ID id_encoder_begin_callback_user_data;
68
  static ID id_abort_callback;
69
  static ID id_abort_callback_user_data;
 
 
 
70
 
71
  static void
72
  rb_whisper_callbcack_container_mark(ruby_whisper_callback_container *rwc)
@@ -177,7 +183,7 @@ static bool abort_callback(void * user_data) {
177
  return false;
178
  }
179
 
180
- void register_callbacks(ruby_whisper_params * rwp, VALUE * context) {
181
  if (!NIL_P(rwp->new_segment_callback_container->callback) || 0 != RARRAY_LEN(rwp->new_segment_callback_container->callbacks)) {
182
  rwp->new_segment_callback_container->context = context;
183
  rwp->params.new_segment_callback = new_segment_callback;
@@ -203,13 +209,29 @@ void register_callbacks(ruby_whisper_params * rwp, VALUE * context) {
203
  }
204
  }
205
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  void
207
- rb_whisper_params_mark(ruby_whisper_params *rwp)
208
  {
 
209
  rb_whisper_callbcack_container_mark(rwp->new_segment_callback_container);
210
  rb_whisper_callbcack_container_mark(rwp->progress_callback_container);
211
  rb_whisper_callbcack_container_mark(rwp->encoder_begin_callback_container);
212
  rb_whisper_callbcack_container_mark(rwp->abort_callback_container);
 
213
  }
214
 
215
  void
@@ -218,25 +240,46 @@ ruby_whisper_params_free(ruby_whisper_params *rwp)
218
  }
219
 
220
  void
221
- rb_whisper_params_free(ruby_whisper_params *rwp)
222
  {
 
223
  // How to free user_data and callback only when not referred to by others?
224
  ruby_whisper_params_free(rwp);
225
  free(rwp);
226
  }
227
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  static VALUE
229
  ruby_whisper_params_allocate(VALUE klass)
230
  {
231
  ruby_whisper_params *rwp;
232
- rwp = ALLOC(ruby_whisper_params);
233
  rwp->params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
234
  rwp->diarize = false;
 
235
  rwp->new_segment_callback_container = rb_whisper_callback_container_allocate();
236
  rwp->progress_callback_container = rb_whisper_callback_container_allocate();
237
  rwp->encoder_begin_callback_container = rb_whisper_callback_container_allocate();
238
  rwp->abort_callback_container = rb_whisper_callback_container_allocate();
239
- return Data_Wrap_Struct(klass, rb_whisper_params_mark, rb_whisper_params_free, rwp);
240
  }
241
 
242
  /*
@@ -249,7 +292,7 @@ static VALUE
249
  ruby_whisper_params_set_language(VALUE self, VALUE value)
250
  {
251
  ruby_whisper_params *rwp;
252
- Data_Get_Struct(self, ruby_whisper_params, rwp);
253
  if (value == Qfalse || value == Qnil) {
254
  rwp->params.language = "auto";
255
  } else {
@@ -265,7 +308,7 @@ static VALUE
265
  ruby_whisper_params_get_language(VALUE self)
266
  {
267
  ruby_whisper_params *rwp;
268
- Data_Get_Struct(self, ruby_whisper_params, rwp);
269
  if (rwp->params.language) {
270
  return rb_str_new2(rwp->params.language);
271
  } else {
@@ -502,7 +545,7 @@ static VALUE
502
  ruby_whisper_params_get_initial_prompt(VALUE self)
503
  {
504
  ruby_whisper_params *rwp;
505
- Data_Get_Struct(self, ruby_whisper_params, rwp);
506
  return rwp->params.initial_prompt == NULL ? Qnil : rb_str_new2(rwp->params.initial_prompt);
507
  }
508
  /*
@@ -513,7 +556,7 @@ static VALUE
513
  ruby_whisper_params_set_initial_prompt(VALUE self, VALUE value)
514
  {
515
  ruby_whisper_params *rwp;
516
- Data_Get_Struct(self, ruby_whisper_params, rwp);
517
  rwp->params.initial_prompt = StringValueCStr(value);
518
  return value;
519
  }
@@ -527,7 +570,7 @@ static VALUE
527
  ruby_whisper_params_get_diarize(VALUE self)
528
  {
529
  ruby_whisper_params *rwp;
530
- Data_Get_Struct(self, ruby_whisper_params, rwp);
531
  if (rwp->diarize) {
532
  return Qtrue;
533
  } else {
@@ -542,7 +585,7 @@ static VALUE
542
  ruby_whisper_params_set_diarize(VALUE self, VALUE value)
543
  {
544
  ruby_whisper_params *rwp;
545
- Data_Get_Struct(self, ruby_whisper_params, rwp);
546
  if (value == Qfalse || value == Qnil) {
547
  rwp->diarize = false;
548
  } else {
@@ -561,7 +604,7 @@ static VALUE
561
  ruby_whisper_params_get_offset(VALUE self)
562
  {
563
  ruby_whisper_params *rwp;
564
- Data_Get_Struct(self, ruby_whisper_params, rwp);
565
  return INT2NUM(rwp->params.offset_ms);
566
  }
567
  /*
@@ -572,7 +615,7 @@ static VALUE
572
  ruby_whisper_params_set_offset(VALUE self, VALUE value)
573
  {
574
  ruby_whisper_params *rwp;
575
- Data_Get_Struct(self, ruby_whisper_params, rwp);
576
  rwp->params.offset_ms = NUM2INT(value);
577
  return value;
578
  }
@@ -586,7 +629,7 @@ static VALUE
586
  ruby_whisper_params_get_duration(VALUE self)
587
  {
588
  ruby_whisper_params *rwp;
589
- Data_Get_Struct(self, ruby_whisper_params, rwp);
590
  return INT2NUM(rwp->params.duration_ms);
591
  }
592
  /*
@@ -597,7 +640,7 @@ static VALUE
597
  ruby_whisper_params_set_duration(VALUE self, VALUE value)
598
  {
599
  ruby_whisper_params *rwp;
600
- Data_Get_Struct(self, ruby_whisper_params, rwp);
601
  rwp->params.duration_ms = NUM2INT(value);
602
  return value;
603
  }
@@ -612,7 +655,7 @@ static VALUE
612
  ruby_whisper_params_get_max_text_tokens(VALUE self)
613
  {
614
  ruby_whisper_params *rwp;
615
- Data_Get_Struct(self, ruby_whisper_params, rwp);
616
  return INT2NUM(rwp->params.n_max_text_ctx);
617
  }
618
  /*
@@ -623,7 +666,7 @@ static VALUE
623
  ruby_whisper_params_set_max_text_tokens(VALUE self, VALUE value)
624
  {
625
  ruby_whisper_params *rwp;
626
- Data_Get_Struct(self, ruby_whisper_params, rwp);
627
  rwp->params.n_max_text_ctx = NUM2INT(value);
628
  return value;
629
  }
@@ -635,7 +678,7 @@ static VALUE
635
  ruby_whisper_params_get_temperature(VALUE self)
636
  {
637
  ruby_whisper_params *rwp;
638
- Data_Get_Struct(self, ruby_whisper_params, rwp);
639
  return DBL2NUM(rwp->params.temperature);
640
  }
641
  /*
@@ -646,7 +689,7 @@ static VALUE
646
  ruby_whisper_params_set_temperature(VALUE self, VALUE value)
647
  {
648
  ruby_whisper_params *rwp;
649
- Data_Get_Struct(self, ruby_whisper_params, rwp);
650
  rwp->params.temperature = RFLOAT_VALUE(value);
651
  return value;
652
  }
@@ -660,7 +703,7 @@ static VALUE
660
  ruby_whisper_params_get_max_initial_ts(VALUE self)
661
  {
662
  ruby_whisper_params *rwp;
663
- Data_Get_Struct(self, ruby_whisper_params, rwp);
664
  return DBL2NUM(rwp->params.max_initial_ts);
665
  }
666
  /*
@@ -671,7 +714,7 @@ static VALUE
671
  ruby_whisper_params_set_max_initial_ts(VALUE self, VALUE value)
672
  {
673
  ruby_whisper_params *rwp;
674
- Data_Get_Struct(self, ruby_whisper_params, rwp);
675
  rwp->params.max_initial_ts = RFLOAT_VALUE(value);
676
  return value;
677
  }
@@ -683,7 +726,7 @@ static VALUE
683
  ruby_whisper_params_get_length_penalty(VALUE self)
684
  {
685
  ruby_whisper_params *rwp;
686
- Data_Get_Struct(self, ruby_whisper_params, rwp);
687
  return DBL2NUM(rwp->params.length_penalty);
688
  }
689
  /*
@@ -694,7 +737,7 @@ static VALUE
694
  ruby_whisper_params_set_length_penalty(VALUE self, VALUE value)
695
  {
696
  ruby_whisper_params *rwp;
697
- Data_Get_Struct(self, ruby_whisper_params, rwp);
698
  rwp->params.length_penalty = RFLOAT_VALUE(value);
699
  return value;
700
  }
@@ -706,7 +749,7 @@ static VALUE
706
  ruby_whisper_params_get_temperature_inc(VALUE self)
707
  {
708
  ruby_whisper_params *rwp;
709
- Data_Get_Struct(self, ruby_whisper_params, rwp);
710
  return DBL2NUM(rwp->params.temperature_inc);
711
  }
712
  /*
@@ -717,7 +760,7 @@ static VALUE
717
  ruby_whisper_params_set_temperature_inc(VALUE self, VALUE value)
718
  {
719
  ruby_whisper_params *rwp;
720
- Data_Get_Struct(self, ruby_whisper_params, rwp);
721
  rwp->params.temperature_inc = RFLOAT_VALUE(value);
722
  return value;
723
  }
@@ -731,7 +774,7 @@ static VALUE
731
  ruby_whisper_params_get_entropy_thold(VALUE self)
732
  {
733
  ruby_whisper_params *rwp;
734
- Data_Get_Struct(self, ruby_whisper_params, rwp);
735
  return DBL2NUM(rwp->params.entropy_thold);
736
  }
737
  /*
@@ -742,7 +785,7 @@ static VALUE
742
  ruby_whisper_params_set_entropy_thold(VALUE self, VALUE value)
743
  {
744
  ruby_whisper_params *rwp;
745
- Data_Get_Struct(self, ruby_whisper_params, rwp);
746
  rwp->params.entropy_thold = RFLOAT_VALUE(value);
747
  return value;
748
  }
@@ -754,7 +797,7 @@ static VALUE
754
  ruby_whisper_params_get_logprob_thold(VALUE self)
755
  {
756
  ruby_whisper_params *rwp;
757
- Data_Get_Struct(self, ruby_whisper_params, rwp);
758
  return DBL2NUM(rwp->params.logprob_thold);
759
  }
760
  /*
@@ -765,7 +808,7 @@ static VALUE
765
  ruby_whisper_params_set_logprob_thold(VALUE self, VALUE value)
766
  {
767
  ruby_whisper_params *rwp;
768
- Data_Get_Struct(self, ruby_whisper_params, rwp);
769
  rwp->params.logprob_thold = RFLOAT_VALUE(value);
770
  return value;
771
  }
@@ -777,7 +820,7 @@ static VALUE
777
  ruby_whisper_params_get_no_speech_thold(VALUE self)
778
  {
779
  ruby_whisper_params *rwp;
780
- Data_Get_Struct(self, ruby_whisper_params, rwp);
781
  return DBL2NUM(rwp->params.no_speech_thold);
782
  }
783
  /*
@@ -788,7 +831,7 @@ static VALUE
788
  ruby_whisper_params_set_no_speech_thold(VALUE self, VALUE value)
789
  {
790
  ruby_whisper_params *rwp;
791
- Data_Get_Struct(self, ruby_whisper_params, rwp);
792
  rwp->params.no_speech_thold = RFLOAT_VALUE(value);
793
  return value;
794
  }
@@ -796,7 +839,7 @@ static VALUE
796
  ruby_whisper_params_get_new_segment_callback(VALUE self)
797
  {
798
  ruby_whisper_params *rwp;
799
- Data_Get_Struct(self, ruby_whisper_params, rwp);
800
  return rwp->new_segment_callback_container->callback;
801
  }
802
  /*
@@ -813,7 +856,7 @@ static VALUE
813
  ruby_whisper_params_set_new_segment_callback(VALUE self, VALUE value)
814
  {
815
  ruby_whisper_params *rwp;
816
- Data_Get_Struct(self, ruby_whisper_params, rwp);
817
  rwp->new_segment_callback_container->callback = value;
818
  return value;
819
  }
@@ -821,7 +864,7 @@ static VALUE
821
  ruby_whisper_params_get_new_segment_callback_user_data(VALUE self)
822
  {
823
  ruby_whisper_params *rwp;
824
- Data_Get_Struct(self, ruby_whisper_params, rwp);
825
  return rwp->new_segment_callback_container->user_data;
826
  }
827
  /*
@@ -834,7 +877,7 @@ static VALUE
834
  ruby_whisper_params_set_new_segment_callback_user_data(VALUE self, VALUE value)
835
  {
836
  ruby_whisper_params *rwp;
837
- Data_Get_Struct(self, ruby_whisper_params, rwp);
838
  rwp->new_segment_callback_container->user_data = value;
839
  return value;
840
  }
@@ -842,7 +885,7 @@ static VALUE
842
  ruby_whisper_params_get_progress_callback(VALUE self)
843
  {
844
  ruby_whisper_params *rwp;
845
- Data_Get_Struct(self, ruby_whisper_params, rwp);
846
  return rwp->progress_callback_container->callback;
847
  }
848
  /*
@@ -861,7 +904,7 @@ static VALUE
861
  ruby_whisper_params_set_progress_callback(VALUE self, VALUE value)
862
  {
863
  ruby_whisper_params *rwp;
864
- Data_Get_Struct(self, ruby_whisper_params, rwp);
865
  rwp->progress_callback_container->callback = value;
866
  return value;
867
  }
@@ -869,7 +912,7 @@ static VALUE
869
  ruby_whisper_params_get_progress_callback_user_data(VALUE self)
870
  {
871
  ruby_whisper_params *rwp;
872
- Data_Get_Struct(self, ruby_whisper_params, rwp);
873
  return rwp->progress_callback_container->user_data;
874
  }
875
  /*
@@ -882,7 +925,7 @@ static VALUE
882
  ruby_whisper_params_set_progress_callback_user_data(VALUE self, VALUE value)
883
  {
884
  ruby_whisper_params *rwp;
885
- Data_Get_Struct(self, ruby_whisper_params, rwp);
886
  rwp->progress_callback_container->user_data = value;
887
  return value;
888
  }
@@ -891,7 +934,7 @@ static VALUE
891
  ruby_whisper_params_get_encoder_begin_callback(VALUE self)
892
  {
893
  ruby_whisper_params *rwp;
894
- Data_Get_Struct(self, ruby_whisper_params, rwp);
895
  return rwp->encoder_begin_callback_container->callback;
896
  }
897
 
@@ -909,7 +952,7 @@ static VALUE
909
  ruby_whisper_params_set_encoder_begin_callback(VALUE self, VALUE value)
910
  {
911
  ruby_whisper_params *rwp;
912
- Data_Get_Struct(self, ruby_whisper_params, rwp);
913
  rwp->encoder_begin_callback_container->callback = value;
914
  return value;
915
  }
@@ -918,7 +961,7 @@ static VALUE
918
  ruby_whisper_params_get_encoder_begin_callback_user_data(VALUE self)
919
  {
920
  ruby_whisper_params *rwp;
921
- Data_Get_Struct(self, ruby_whisper_params, rwp);
922
  return rwp->encoder_begin_callback_container->user_data;
923
  }
924
 
@@ -932,7 +975,7 @@ static VALUE
932
  ruby_whisper_params_set_encoder_begin_callback_user_data(VALUE self, VALUE value)
933
  {
934
  ruby_whisper_params *rwp;
935
- Data_Get_Struct(self, ruby_whisper_params, rwp);
936
  rwp->encoder_begin_callback_container->user_data = value;
937
  return value;
938
  }
@@ -941,7 +984,7 @@ static VALUE
941
  ruby_whisper_params_get_abort_callback(VALUE self)
942
  {
943
  ruby_whisper_params *rwp;
944
- Data_Get_Struct(self, ruby_whisper_params, rwp);
945
  return rwp->abort_callback_container->callback;
946
  }
947
  /*
@@ -958,7 +1001,7 @@ static VALUE
958
  ruby_whisper_params_set_abort_callback(VALUE self, VALUE value)
959
  {
960
  ruby_whisper_params *rwp;
961
- Data_Get_Struct(self, ruby_whisper_params, rwp);
962
  rwp->abort_callback_container->callback = value;
963
  return value;
964
  }
@@ -966,7 +1009,7 @@ static VALUE
966
  ruby_whisper_params_get_abort_callback_user_data(VALUE self)
967
  {
968
  ruby_whisper_params *rwp;
969
- Data_Get_Struct(self, ruby_whisper_params, rwp);
970
  return rwp->abort_callback_container->user_data;
971
  }
972
  /*
@@ -979,11 +1022,74 @@ static VALUE
979
  ruby_whisper_params_set_abort_callback_user_data(VALUE self, VALUE value)
980
  {
981
  ruby_whisper_params *rwp;
982
- Data_Get_Struct(self, ruby_whisper_params, rwp);
983
  rwp->abort_callback_container->user_data = value;
984
  return value;
985
  }
986
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
987
  #define SET_PARAM_IF_SAME(param_name) \
988
  if (id == id_ ## param_name) { \
989
  ruby_whisper_params_set_ ## param_name(self, value); \
@@ -993,7 +1099,6 @@ ruby_whisper_params_set_abort_callback_user_data(VALUE self, VALUE value)
993
  static VALUE
994
  ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self)
995
  {
996
-
997
  VALUE kw_hash;
998
  VALUE values[RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT] = {Qundef};
999
  VALUE value;
@@ -1007,7 +1112,7 @@ ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self)
1007
  }
1008
 
1009
  rb_get_kwargs(kw_hash, param_names, 0, RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT, values);
1010
- Data_Get_Struct(self, ruby_whisper_params, rwp);
1011
 
1012
  for (i = 0; i < RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT; i++) {
1013
  id = param_names[i];
@@ -1050,6 +1155,9 @@ ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self)
1050
  SET_PARAM_IF_SAME(encoder_begin_callback_user_data)
1051
  SET_PARAM_IF_SAME(abort_callback)
1052
  SET_PARAM_IF_SAME(abort_callback_user_data)
 
 
 
1053
  }
1054
  }
1055
 
@@ -1071,10 +1179,10 @@ ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self)
1071
  static VALUE
1072
  ruby_whisper_params_on_new_segment(VALUE self)
1073
  {
1074
- ruby_whisper_params *rws;
1075
- Data_Get_Struct(self, ruby_whisper_params, rws);
1076
  const VALUE blk = rb_block_proc();
1077
- rb_ary_push(rws->new_segment_callback_container->callbacks, blk);
1078
  return Qnil;
1079
  }
1080
 
@@ -1091,10 +1199,10 @@ ruby_whisper_params_on_new_segment(VALUE self)
1091
  static VALUE
1092
  ruby_whisper_params_on_progress(VALUE self)
1093
  {
1094
- ruby_whisper_params *rws;
1095
- Data_Get_Struct(self, ruby_whisper_params, rws);
1096
  const VALUE blk = rb_block_proc();
1097
- rb_ary_push(rws->progress_callback_container->callbacks, blk);
1098
  return Qnil;
1099
  }
1100
 
@@ -1111,10 +1219,10 @@ ruby_whisper_params_on_progress(VALUE self)
1111
  static VALUE
1112
  ruby_whisper_params_on_encoder_begin(VALUE self)
1113
  {
1114
- ruby_whisper_params *rws;
1115
- Data_Get_Struct(self, ruby_whisper_params, rws);
1116
  const VALUE blk = rb_block_proc();
1117
- rb_ary_push(rws->encoder_begin_callback_container->callbacks, blk);
1118
  return Qnil;
1119
  }
1120
 
@@ -1135,10 +1243,10 @@ ruby_whisper_params_on_encoder_begin(VALUE self)
1135
  static VALUE
1136
  ruby_whisper_params_abort_on(VALUE self)
1137
  {
1138
- ruby_whisper_params *rws;
1139
- Data_Get_Struct(self, ruby_whisper_params, rws);
1140
  const VALUE blk = rb_block_proc();
1141
- rb_ary_push(rws->abort_callback_container->callbacks, blk);
1142
  return Qnil;
1143
  }
1144
 
@@ -1182,6 +1290,9 @@ init_ruby_whisper_params(VALUE *mWhisper)
1182
  DEFINE_PARAM(encoder_begin_callback_user_data, 29)
1183
  DEFINE_PARAM(abort_callback, 30)
1184
  DEFINE_PARAM(abort_callback_user_data, 31)
 
 
 
1185
 
1186
  rb_define_method(cParams, "on_new_segment", ruby_whisper_params_on_new_segment, 0);
1187
  rb_define_method(cParams, "on_progress", ruby_whisper_params_on_progress, 0);
 
3
 
4
  #define BOOL_PARAMS_SETTER(self, prop, value) \
5
  ruby_whisper_params *rwp; \
6
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); \
7
  if (value == Qfalse || value == Qnil) { \
8
  rwp->params.prop = false; \
9
  } else { \
 
13
 
14
  #define BOOL_PARAMS_GETTER(self, prop) \
15
  ruby_whisper_params *rwp; \
16
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); \
17
  if (rwp->params.prop) { \
18
  return Qtrue; \
19
  } else { \
 
26
  rb_define_method(cParams, #param_name, ruby_whisper_params_get_ ## param_name, 0); \
27
  rb_define_method(cParams, #param_name "=", ruby_whisper_params_set_ ## param_name, 1);
28
 
29
+ #define RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT 35
30
 
31
  extern VALUE cParams;
32
+ extern VALUE cVADParams;
33
 
34
  extern ID id_call;
35
 
36
+ extern VALUE ruby_whisper_normalize_model_path(VALUE model_path);
37
  extern VALUE rb_whisper_segment_initialize(VALUE context, int index);
38
+ extern const rb_data_type_t ruby_whisper_vad_params_type;
39
 
40
  static ID param_names[RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT];
41
  static ID id_language;
 
70
  static ID id_encoder_begin_callback_user_data;
71
  static ID id_abort_callback;
72
  static ID id_abort_callback_user_data;
73
+ static ID id_vad;
74
+ static ID id_vad_model_path;
75
+ static ID id_vad_params;
76
 
77
  static void
78
  rb_whisper_callbcack_container_mark(ruby_whisper_callback_container *rwc)
 
183
  return false;
184
  }
185
 
186
+ static void register_callbacks(ruby_whisper_params * rwp, VALUE * context) {
187
  if (!NIL_P(rwp->new_segment_callback_container->callback) || 0 != RARRAY_LEN(rwp->new_segment_callback_container->callbacks)) {
188
  rwp->new_segment_callback_container->context = context;
189
  rwp->params.new_segment_callback = new_segment_callback;
 
209
  }
210
  }
211
 
212
+ static void set_vad_params(ruby_whisper_params *rwp)
213
+ {
214
+ ruby_whisper_vad_params * rwvp;
215
+ TypedData_Get_Struct(rwp->vad_params, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
216
+ rwp->params.vad_params = rwvp->params;
217
+ }
218
+
219
+ void
220
+ prepare_transcription(ruby_whisper_params *rwp, VALUE *context)
221
+ {
222
+ register_callbacks(rwp, context);
223
+ set_vad_params(rwp);
224
+ }
225
+
226
  void
227
+ rb_whisper_params_mark(void *p)
228
  {
229
+ ruby_whisper_params *rwp = (ruby_whisper_params *)p;
230
  rb_whisper_callbcack_container_mark(rwp->new_segment_callback_container);
231
  rb_whisper_callbcack_container_mark(rwp->progress_callback_container);
232
  rb_whisper_callbcack_container_mark(rwp->encoder_begin_callback_container);
233
  rb_whisper_callbcack_container_mark(rwp->abort_callback_container);
234
+ rb_gc_mark(rwp->vad_params);
235
  }
236
 
237
  void
 
240
  }
241
 
242
  void
243
+ rb_whisper_params_free(void *p)
244
  {
245
+ ruby_whisper_params *rwp = (ruby_whisper_params *)p;
246
  // How to free user_data and callback only when not referred to by others?
247
  ruby_whisper_params_free(rwp);
248
  free(rwp);
249
  }
250
 
251
+ static size_t
252
+ ruby_whisper_params_memsize(const void *p)
253
+ {
254
+ const ruby_whisper_params *rwp = (const ruby_whisper_params *)p;
255
+
256
+ return sizeof(ruby_whisper_params) + sizeof(rwp->params) + sizeof(rwp->vad_params);
257
+ }
258
+
259
+ const rb_data_type_t ruby_whisper_params_type = {
260
+ "ruby_whisper_params",
261
+ {
262
+ rb_whisper_params_mark,
263
+ rb_whisper_params_free,
264
+ ruby_whisper_params_memsize,
265
+ },
266
+ 0, 0,
267
+ 0
268
+ };
269
+
270
  /*
   * Creates a typed-data object of class `klass` around a new
   * ruby_whisper_params and fills in its initial state: default greedy
   * sampling parameters, diarization off, a VAD params wrapper and one
   * callback container per callback kind. Returns the new object.
   */
  static VALUE
271
  ruby_whisper_params_allocate(VALUE klass)
272
  {
273
  ruby_whisper_params *rwp;
274
+ VALUE obj = TypedData_Make_Struct(klass, ruby_whisper_params, &ruby_whisper_params_type, rwp);
275
  rwp->params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
276
  rwp->diarize = false;
277
  /*
   * NOTE(review): the VAD wrapper points at storage inside this struct
   * (&rwp->params.vad_params), not at its own allocation. Confirm that
   * ruby_whisper_vad_params_type never frees that pointer and that the
   * wrapper cannot be used after this params object is collected.
   */
+ rwp->vad_params = TypedData_Wrap_Struct(cVADParams, &ruby_whisper_vad_params_type, (void *)&rwp->params.vad_params);
278
  /*
   * NOTE(review): the container fields are still unset while the calls
   * above and below allocate; presumably the mark hook tolerates that --
   * verify against rb_whisper_callbcack_container_mark.
   */
  rwp->new_segment_callback_container = rb_whisper_callback_container_allocate();
279
  rwp->progress_callback_container = rb_whisper_callback_container_allocate();
280
  rwp->encoder_begin_callback_container = rb_whisper_callback_container_allocate();
281
  rwp->abort_callback_container = rb_whisper_callback_container_allocate();
282
+ return obj;
283
  }
284
 
285
  /*
 
292
  ruby_whisper_params_set_language(VALUE self, VALUE value)
293
  {
294
  ruby_whisper_params *rwp;
295
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
296
  if (value == Qfalse || value == Qnil) {
297
  rwp->params.language = "auto";
298
  } else {
 
308
  ruby_whisper_params_get_language(VALUE self)
309
  {
310
  ruby_whisper_params *rwp;
311
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
312
  if (rwp->params.language) {
313
  return rb_str_new2(rwp->params.language);
314
  } else {
 
545
  ruby_whisper_params_get_initial_prompt(VALUE self)
546
  {
547
  ruby_whisper_params *rwp;
548
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
549
  return rwp->params.initial_prompt == NULL ? Qnil : rb_str_new2(rwp->params.initial_prompt);
550
  }
551
  /*
 
556
  ruby_whisper_params_set_initial_prompt(VALUE self, VALUE value)
557
  {
  /*
   * Stores the C string obtained from `value` as the initial prompt and
   * returns `value`.
   * NOTE(review): only the raw pointer is stored; nothing in this function
   * keeps `value` referenced. Confirm the String stays alive and unmodified
   * for as long as these params are used.
   */
558
  ruby_whisper_params *rwp;
559
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
560
  rwp->params.initial_prompt = StringValueCStr(value);
561
  return value;
562
  }
 
570
  ruby_whisper_params_get_diarize(VALUE self)
571
  {
572
  ruby_whisper_params *rwp;
573
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
574
  if (rwp->diarize) {
575
  return Qtrue;
576
  } else {
 
585
  ruby_whisper_params_set_diarize(VALUE self, VALUE value)
586
  {
587
  ruby_whisper_params *rwp;
588
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
589
  if (value == Qfalse || value == Qnil) {
590
  rwp->diarize = false;
591
  } else {
 
604
  ruby_whisper_params_get_offset(VALUE self)
605
  {
606
  ruby_whisper_params *rwp;
607
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
608
  return INT2NUM(rwp->params.offset_ms);
609
  }
610
  /*
 
615
  ruby_whisper_params_set_offset(VALUE self, VALUE value)
616
  {
617
  ruby_whisper_params *rwp;
618
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
619
  rwp->params.offset_ms = NUM2INT(value);
620
  return value;
621
  }
 
629
  ruby_whisper_params_get_duration(VALUE self)
630
  {
631
  ruby_whisper_params *rwp;
632
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
633
  return INT2NUM(rwp->params.duration_ms);
634
  }
635
  /*
 
640
  ruby_whisper_params_set_duration(VALUE self, VALUE value)
641
  {
642
  ruby_whisper_params *rwp;
643
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
644
  rwp->params.duration_ms = NUM2INT(value);
645
  return value;
646
  }
 
655
  ruby_whisper_params_get_max_text_tokens(VALUE self)
656
  {
657
  ruby_whisper_params *rwp;
658
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
659
  return INT2NUM(rwp->params.n_max_text_ctx);
660
  }
661
  /*
 
666
  ruby_whisper_params_set_max_text_tokens(VALUE self, VALUE value)
667
  {
668
  ruby_whisper_params *rwp;
669
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
670
  rwp->params.n_max_text_ctx = NUM2INT(value);
671
  return value;
672
  }
 
678
  ruby_whisper_params_get_temperature(VALUE self)
679
  {
680
  ruby_whisper_params *rwp;
681
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
682
  return DBL2NUM(rwp->params.temperature);
683
  }
684
  /*
 
689
  ruby_whisper_params_set_temperature(VALUE self, VALUE value)
690
  {
691
  ruby_whisper_params *rwp;
692
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
693
  rwp->params.temperature = RFLOAT_VALUE(value);
694
  return value;
695
  }
 
703
  ruby_whisper_params_get_max_initial_ts(VALUE self)
704
  {
705
  ruby_whisper_params *rwp;
706
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
707
  return DBL2NUM(rwp->params.max_initial_ts);
708
  }
709
  /*
 
714
  ruby_whisper_params_set_max_initial_ts(VALUE self, VALUE value)
715
  {
716
  ruby_whisper_params *rwp;
717
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
718
  rwp->params.max_initial_ts = RFLOAT_VALUE(value);
719
  return value;
720
  }
 
726
  ruby_whisper_params_get_length_penalty(VALUE self)
727
  {
728
  ruby_whisper_params *rwp;
729
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
730
  return DBL2NUM(rwp->params.length_penalty);
731
  }
732
  /*
 
737
  ruby_whisper_params_set_length_penalty(VALUE self, VALUE value)
738
  {
739
  ruby_whisper_params *rwp;
740
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
741
  rwp->params.length_penalty = RFLOAT_VALUE(value);
742
  return value;
743
  }
 
749
  ruby_whisper_params_get_temperature_inc(VALUE self)
750
  {
751
  ruby_whisper_params *rwp;
752
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
753
  return DBL2NUM(rwp->params.temperature_inc);
754
  }
755
  /*
 
760
  ruby_whisper_params_set_temperature_inc(VALUE self, VALUE value)
761
  {
762
  ruby_whisper_params *rwp;
763
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
764
  rwp->params.temperature_inc = RFLOAT_VALUE(value);
765
  return value;
766
  }
 
774
  ruby_whisper_params_get_entropy_thold(VALUE self)
775
  {
776
  ruby_whisper_params *rwp;
777
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
778
  return DBL2NUM(rwp->params.entropy_thold);
779
  }
780
  /*
 
785
  ruby_whisper_params_set_entropy_thold(VALUE self, VALUE value)
786
  {
787
  ruby_whisper_params *rwp;
788
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
789
  rwp->params.entropy_thold = RFLOAT_VALUE(value);
790
  return value;
791
  }
 
797
  ruby_whisper_params_get_logprob_thold(VALUE self)
798
  {
799
  ruby_whisper_params *rwp;
800
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
801
  return DBL2NUM(rwp->params.logprob_thold);
802
  }
803
  /*
 
808
  ruby_whisper_params_set_logprob_thold(VALUE self, VALUE value)
809
  {
810
  ruby_whisper_params *rwp;
811
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
812
  rwp->params.logprob_thold = RFLOAT_VALUE(value);
813
  return value;
814
  }
 
820
  ruby_whisper_params_get_no_speech_thold(VALUE self)
821
  {
822
  ruby_whisper_params *rwp;
823
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
824
  return DBL2NUM(rwp->params.no_speech_thold);
825
  }
826
  /*
 
831
  ruby_whisper_params_set_no_speech_thold(VALUE self, VALUE value)
832
  {
833
  ruby_whisper_params *rwp;
834
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
835
  rwp->params.no_speech_thold = RFLOAT_VALUE(value);
836
  return value;
837
  }
 
839
  ruby_whisper_params_get_new_segment_callback(VALUE self)
840
  {
841
  ruby_whisper_params *rwp;
842
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
843
  return rwp->new_segment_callback_container->callback;
844
  }
845
  /*
 
856
  ruby_whisper_params_set_new_segment_callback(VALUE self, VALUE value)
857
  {
858
  ruby_whisper_params *rwp;
859
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
860
  rwp->new_segment_callback_container->callback = value;
861
  return value;
862
  }
 
864
  ruby_whisper_params_get_new_segment_callback_user_data(VALUE self)
865
  {
866
  ruby_whisper_params *rwp;
867
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
868
  return rwp->new_segment_callback_container->user_data;
869
  }
870
  /*
 
877
  ruby_whisper_params_set_new_segment_callback_user_data(VALUE self, VALUE value)
878
  {
879
  ruby_whisper_params *rwp;
880
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
881
  rwp->new_segment_callback_container->user_data = value;
882
  return value;
883
  }
 
885
  ruby_whisper_params_get_progress_callback(VALUE self)
886
  {
887
  ruby_whisper_params *rwp;
888
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
889
  return rwp->progress_callback_container->callback;
890
  }
891
  /*
 
904
  ruby_whisper_params_set_progress_callback(VALUE self, VALUE value)
905
  {
906
  ruby_whisper_params *rwp;
907
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
908
  rwp->progress_callback_container->callback = value;
909
  return value;
910
  }
 
912
  ruby_whisper_params_get_progress_callback_user_data(VALUE self)
913
  {
914
  ruby_whisper_params *rwp;
915
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
916
  return rwp->progress_callback_container->user_data;
917
  }
918
  /*
 
925
  ruby_whisper_params_set_progress_callback_user_data(VALUE self, VALUE value)
926
  {
927
  ruby_whisper_params *rwp;
928
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
929
  rwp->progress_callback_container->user_data = value;
930
  return value;
931
  }
 
934
  ruby_whisper_params_get_encoder_begin_callback(VALUE self)
935
  {
936
  ruby_whisper_params *rwp;
937
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
938
  return rwp->encoder_begin_callback_container->callback;
939
  }
940
 
 
952
  ruby_whisper_params_set_encoder_begin_callback(VALUE self, VALUE value)
953
  {
954
  ruby_whisper_params *rwp;
955
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
956
  rwp->encoder_begin_callback_container->callback = value;
957
  return value;
958
  }
 
961
  ruby_whisper_params_get_encoder_begin_callback_user_data(VALUE self)
962
  {
963
  ruby_whisper_params *rwp;
964
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
965
  return rwp->encoder_begin_callback_container->user_data;
966
  }
967
 
 
975
  ruby_whisper_params_set_encoder_begin_callback_user_data(VALUE self, VALUE value)
976
  {
977
  ruby_whisper_params *rwp;
978
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
979
  rwp->encoder_begin_callback_container->user_data = value;
980
  return value;
981
  }
 
984
  ruby_whisper_params_get_abort_callback(VALUE self)
985
  {
986
  ruby_whisper_params *rwp;
987
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
988
  return rwp->abort_callback_container->callback;
989
  }
990
  /*
 
1001
  ruby_whisper_params_set_abort_callback(VALUE self, VALUE value)
1002
  {
1003
  ruby_whisper_params *rwp;
1004
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
1005
  rwp->abort_callback_container->callback = value;
1006
  return value;
1007
  }
 
1009
  ruby_whisper_params_get_abort_callback_user_data(VALUE self)
1010
  {
1011
  ruby_whisper_params *rwp;
1012
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
1013
  return rwp->abort_callback_container->user_data;
1014
  }
1015
  /*
 
1022
  ruby_whisper_params_set_abort_callback_user_data(VALUE self, VALUE value)
1023
  {
1024
  ruby_whisper_params *rwp;
1025
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
1026
  rwp->abort_callback_container->user_data = value;
1027
  return value;
1028
  }
1029
 
1030
+ /*
1031
+ * call-seq:
1032
+ * vad = use_vad -> use_vad
1033
+ */
1034
+ static VALUE
1035
+ ruby_whisper_params_get_vad(VALUE self)
1036
+ {
1037
+ BOOL_PARAMS_GETTER(self, vad)
1038
+ }
1039
+
1040
+ static VALUE
1041
+ ruby_whisper_params_set_vad(VALUE self, VALUE value)
1042
+ {
1043
+ BOOL_PARAMS_SETTER(self, vad, value)
1044
+ }
1045
+
1046
+ /*
1047
+ * call-seq:
1048
+ * vad_model_path = model_path -> model_path
1049
+ */
1050
+ static VALUE
1051
+ ruby_whisper_params_set_vad_model_path(VALUE self, VALUE value)
1052
+ {
1053
+ ruby_whisper_params *rwp;
1054
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
1055
+ if (NIL_P(value)) {
1056
+ rwp->params.vad_model_path = NULL;
1057
+ return value;
1058
+ }
1059
+ VALUE path = ruby_whisper_normalize_model_path(value);
1060
+ rwp->params.vad_model_path = StringValueCStr(path);
1061
+ return value;
1062
+ }
1063
+
1064
+ static VALUE
1065
+ ruby_whisper_params_get_vad_model_path(VALUE self)
1066
+ {
1067
+ ruby_whisper_params *rwp;
1068
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
1069
+ return rwp->params.vad_model_path == NULL ? Qnil : rb_str_new2(rwp->params.vad_model_path);
1070
+ }
1071
+
1072
+ /*
1073
+ * call-seq:
1074
+ * vad_params = params -> params
1075
+ */
1076
+ static VALUE
1077
+ ruby_whisper_params_set_vad_params(VALUE self, VALUE value)
1078
+ {
1079
+ ruby_whisper_params *rwp;
1080
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
1081
+ rwp->vad_params = value;
1082
+ return value;
1083
+ }
1084
+
1085
+ static VALUE
1086
+ ruby_whisper_params_get_vad_params(VALUE self)
1087
+ {
1088
+ ruby_whisper_params *rwp;
1089
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
1090
+ return rwp->vad_params;
1091
+ }
1092
+
1093
  #define SET_PARAM_IF_SAME(param_name) \
1094
  if (id == id_ ## param_name) { \
1095
  ruby_whisper_params_set_ ## param_name(self, value); \
 
1099
  static VALUE
1100
  ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self)
1101
  {
 
1102
  VALUE kw_hash;
1103
  VALUE values[RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT] = {Qundef};
1104
  VALUE value;
 
1112
  }
1113
 
1114
  rb_get_kwargs(kw_hash, param_names, 0, RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT, values);
1115
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
1116
 
1117
  for (i = 0; i < RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT; i++) {
1118
  id = param_names[i];
 
1155
  SET_PARAM_IF_SAME(encoder_begin_callback_user_data)
1156
  SET_PARAM_IF_SAME(abort_callback)
1157
  SET_PARAM_IF_SAME(abort_callback_user_data)
1158
+ SET_PARAM_IF_SAME(vad)
1159
+ SET_PARAM_IF_SAME(vad_model_path)
1160
+ SET_PARAM_IF_SAME(vad_params)
1161
  }
1162
  }
1163
 
 
1179
  static VALUE
1180
  ruby_whisper_params_on_new_segment(VALUE self)
1181
  {
1182
+ ruby_whisper_params *rwp;
1183
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
1184
  const VALUE blk = rb_block_proc();
1185
+ rb_ary_push(rwp->new_segment_callback_container->callbacks, blk);
1186
  return Qnil;
1187
  }
1188
 
 
1199
  static VALUE
1200
  ruby_whisper_params_on_progress(VALUE self)
1201
  {
1202
+ ruby_whisper_params *rwp;
1203
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
1204
  const VALUE blk = rb_block_proc();
1205
+ rb_ary_push(rwp->progress_callback_container->callbacks, blk);
1206
  return Qnil;
1207
  }
1208
 
 
1219
  static VALUE
1220
  ruby_whisper_params_on_encoder_begin(VALUE self)
1221
  {
1222
+ ruby_whisper_params *rwp;
1223
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
1224
  const VALUE blk = rb_block_proc();
1225
+ rb_ary_push(rwp->encoder_begin_callback_container->callbacks, blk);
1226
  return Qnil;
1227
  }
1228
 
 
1243
  static VALUE
1244
  ruby_whisper_params_abort_on(VALUE self)
1245
  {
1246
+ ruby_whisper_params *rwp;
1247
+ TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
1248
  const VALUE blk = rb_block_proc();
1249
+ rb_ary_push(rwp->abort_callback_container->callbacks, blk);
1250
  return Qnil;
1251
  }
1252
 
 
1290
  DEFINE_PARAM(encoder_begin_callback_user_data, 29)
1291
  DEFINE_PARAM(abort_callback, 30)
1292
  DEFINE_PARAM(abort_callback_user_data, 31)
1293
+ DEFINE_PARAM(vad, 32)
1294
+ DEFINE_PARAM(vad_model_path, 33)
1295
+ DEFINE_PARAM(vad_params, 34)
1296
 
1297
  rb_define_method(cParams, "on_new_segment", ruby_whisper_params_on_new_segment, 0);
1298
  rb_define_method(cParams, "on_progress", ruby_whisper_params_on_progress, 0);
bindings/ruby/ext/ruby_whisper_segment.c CHANGED
@@ -1,20 +1,40 @@
1
  #include <ruby.h>
2
  #include "ruby_whisper.h"
3
 
 
 
4
  extern VALUE cSegment;
5
 
6
  static void
7
- rb_whisper_segment_mark(ruby_whisper_segment *rws)
8
  {
 
9
  rb_gc_mark(rws->context);
10
  }
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  VALUE
13
  ruby_whisper_segment_allocate(VALUE klass)
14
  {
15
  ruby_whisper_segment *rws;
16
- rws = ALLOC(ruby_whisper_segment);
17
- return Data_Wrap_Struct(klass, rb_whisper_segment_mark, RUBY_DEFAULT_FREE, rws);
18
  }
19
 
20
  VALUE
@@ -22,7 +42,7 @@ rb_whisper_segment_initialize(VALUE context, int index)
22
  {
23
  ruby_whisper_segment *rws;
24
  const VALUE segment = ruby_whisper_segment_allocate(cSegment);
25
- Data_Get_Struct(segment, ruby_whisper_segment, rws);
26
  rws->context = context;
27
  rws->index = index;
28
  return segment;
@@ -38,9 +58,9 @@ static VALUE
38
  ruby_whisper_segment_get_start_time(VALUE self)
39
  {
40
  ruby_whisper_segment *rws;
41
- Data_Get_Struct(self, ruby_whisper_segment, rws);
42
  ruby_whisper *rw;
43
- Data_Get_Struct(rws->context, ruby_whisper, rw);
44
  const int64_t t0 = whisper_full_get_segment_t0(rw->context, rws->index);
45
  // able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
46
  return INT2NUM(t0 * 10);
@@ -56,9 +76,9 @@ static VALUE
56
  ruby_whisper_segment_get_end_time(VALUE self)
57
  {
58
  ruby_whisper_segment *rws;
59
- Data_Get_Struct(self, ruby_whisper_segment, rws);
60
  ruby_whisper *rw;
61
- Data_Get_Struct(rws->context, ruby_whisper, rw);
62
  const int64_t t1 = whisper_full_get_segment_t1(rw->context, rws->index);
63
  // able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
64
  return INT2NUM(t1 * 10);
@@ -74,9 +94,9 @@ static VALUE
74
  ruby_whisper_segment_get_speaker_turn_next(VALUE self)
75
  {
76
  ruby_whisper_segment *rws;
77
- Data_Get_Struct(self, ruby_whisper_segment, rws);
78
  ruby_whisper *rw;
79
- Data_Get_Struct(rws->context, ruby_whisper, rw);
80
  return whisper_full_get_segment_speaker_turn_next(rw->context, rws->index) ? Qtrue : Qfalse;
81
  }
82
 
@@ -88,9 +108,9 @@ static VALUE
88
  ruby_whisper_segment_get_text(VALUE self)
89
  {
90
  ruby_whisper_segment *rws;
91
- Data_Get_Struct(self, ruby_whisper_segment, rws);
92
  ruby_whisper *rw;
93
- Data_Get_Struct(rws->context, ruby_whisper, rw);
94
  const char * text = whisper_full_get_segment_text(rw->context, rws->index);
95
  return rb_str_new2(text);
96
  }
@@ -103,9 +123,9 @@ static VALUE
103
  ruby_whisper_segment_get_no_speech_prob(VALUE self)
104
  {
105
  ruby_whisper_segment *rws;
106
- Data_Get_Struct(self, ruby_whisper_segment, rws);
107
  ruby_whisper *rw;
108
- Data_Get_Struct(rws->context, ruby_whisper, rw);
109
  return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index));
110
  }
111
 
 
1
  #include <ruby.h>
2
  #include "ruby_whisper.h"
3
 
4
+ extern const rb_data_type_t ruby_whisper_type;
5
+
6
  extern VALUE cSegment;
7
 
8
  static void
9
+ rb_whisper_segment_mark(void *p)
10
  {
11
+ ruby_whisper_segment *rws = (ruby_whisper_segment *)p;
12
  rb_gc_mark(rws->context);
13
  }
14
 
15
+ static size_t
16
+ ruby_whisper_segment_memsize(const void *p)
17
+ {
18
+ const ruby_whisper_segment *rws = (const ruby_whisper_segment *)p;
19
+ size_t size = sizeof(rws);
20
+ if (!rws) {
21
+ return 0;
22
+ }
23
+ return size;
24
+ }
25
+
26
+ static const rb_data_type_t ruby_whisper_segment_type = {
27
+ "ruby_whisper_segment",
28
+ {rb_whisper_segment_mark, RUBY_DEFAULT_FREE, ruby_whisper_segment_memsize,},
29
+ 0, 0,
30
+ 0
31
+ };
32
+
33
  VALUE
34
  ruby_whisper_segment_allocate(VALUE klass)
35
  {
36
  ruby_whisper_segment *rws;
37
+ return TypedData_Make_Struct(klass, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
 
38
  }
39
 
40
  VALUE
 
42
  {
43
  ruby_whisper_segment *rws;
44
  const VALUE segment = ruby_whisper_segment_allocate(cSegment);
45
+ TypedData_Get_Struct(segment, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
46
  rws->context = context;
47
  rws->index = index;
48
  return segment;
 
58
  ruby_whisper_segment_get_start_time(VALUE self)
59
  {
60
  ruby_whisper_segment *rws;
61
+ TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
62
  ruby_whisper *rw;
63
+ TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
64
  const int64_t t0 = whisper_full_get_segment_t0(rw->context, rws->index);
65
  // able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
66
  return INT2NUM(t0 * 10);
 
76
  ruby_whisper_segment_get_end_time(VALUE self)
77
  {
78
  ruby_whisper_segment *rws;
79
+ TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
80
  ruby_whisper *rw;
81
+ TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
82
  const int64_t t1 = whisper_full_get_segment_t1(rw->context, rws->index);
83
  // able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
84
  return INT2NUM(t1 * 10);
 
94
  ruby_whisper_segment_get_speaker_turn_next(VALUE self)
95
  {
96
  ruby_whisper_segment *rws;
97
+ TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
98
  ruby_whisper *rw;
99
+ TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
100
  return whisper_full_get_segment_speaker_turn_next(rw->context, rws->index) ? Qtrue : Qfalse;
101
  }
102
 
 
108
  ruby_whisper_segment_get_text(VALUE self)
109
  {
110
  ruby_whisper_segment *rws;
111
+ TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
112
  ruby_whisper *rw;
113
+ TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
114
  const char * text = whisper_full_get_segment_text(rw->context, rws->index);
115
  return rb_str_new2(text);
116
  }
 
123
  ruby_whisper_segment_get_no_speech_prob(VALUE self)
124
  {
125
  ruby_whisper_segment *rws;
126
+ TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
127
  ruby_whisper *rw;
128
+ TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
129
  return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index));
130
  }
131
 
bindings/ruby/ext/ruby_whisper_transcribe.cpp CHANGED
@@ -8,11 +8,14 @@
8
  extern "C" {
9
  #endif
10
 
 
 
 
11
  extern ID id_to_s;
12
  extern ID id_call;
13
 
14
  extern void
15
- register_callbacks(ruby_whisper_params * rwp, VALUE * self);
16
 
17
  /*
18
  * transcribe a single file
@@ -34,8 +37,8 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
34
  VALUE wave_file_path, blk, params;
35
 
36
  rb_scan_args(argc, argv, "02&", &wave_file_path, &params, &blk);
37
- Data_Get_Struct(self, ruby_whisper, rw);
38
- Data_Get_Struct(params, ruby_whisper_params, rwp);
39
 
40
  if (!rb_respond_to(wave_file_path, id_to_s)) {
41
  rb_raise(rb_eRuntimeError, "Expected file path to wave file");
@@ -61,7 +64,7 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
61
  // rwp->params.encoder_begin_callback_user_data = &is_aborted;
62
  // }
63
 
64
- register_callbacks(rwp, &self);
65
 
66
  if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
67
  fprintf(stderr, "failed to process audio\n");
 
8
  extern "C" {
9
  #endif
10
 
11
+ extern const rb_data_type_t ruby_whisper_type;
12
+ extern const rb_data_type_t ruby_whisper_params_type;
13
+
14
  extern ID id_to_s;
15
  extern ID id_call;
16
 
17
  extern void
18
+ prepare_transcription(ruby_whisper_params * rwp, VALUE * self);
19
 
20
  /*
21
  * transcribe a single file
 
37
  VALUE wave_file_path, blk, params;
38
 
39
  rb_scan_args(argc, argv, "02&", &wave_file_path, &params, &blk);
40
+ TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
41
+ TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
42
 
43
  if (!rb_respond_to(wave_file_path, id_to_s)) {
44
  rb_raise(rb_eRuntimeError, "Expected file path to wave file");
 
64
  // rwp->params.encoder_begin_callback_user_data = &is_aborted;
65
  // }
66
 
67
+ prepare_transcription(rwp, &self);
68
 
69
  if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
70
  fprintf(stderr, "failed to process audio\n");
bindings/ruby/ext/ruby_whisper_vad_params.c ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ruby.h>
2
+ #include "ruby_whisper.h"
3
+
4
/*
 * Registers one VAD parameter: interns its name into id_<param_name>,
 * records that ID at index `nth` of param_names, and defines the reader
 * and writer methods on cVADParams.
 */
#define DEFINE_PARAM(param_name, nth) \
  id_ ## param_name = rb_intern(#param_name); \
  param_names[nth] = id_ ## param_name; \
  rb_define_method(cVADParams, #param_name, ruby_whisper_vad_params_get_ ## param_name, 0); \
  rb_define_method(cVADParams, #param_name "=", ruby_whisper_vad_params_set_ ## param_name, 1);
9
+
10
+ #define NUM_PARAMS 6
11
+
12
+ extern VALUE cVADParams;
13
+
14
+ static size_t
15
+ ruby_whisper_vad_params_memsize(const void *p)
16
+ {
17
+ const struct ruby_whisper_vad_params *params = p;
18
+ size_t size = sizeof(params);
19
+ if (!params) {
20
+ return 0;
21
+ }
22
+ return size;
23
+ }
24
+
25
+ static ID param_names[NUM_PARAMS];
26
+ static ID id_threshold;
27
+ static ID id_min_speech_duration_ms;
28
+ static ID id_min_silence_duration_ms;
29
+ static ID id_max_speech_duration_s;
30
+ static ID id_speech_pad_ms;
31
+ static ID id_samples_overlap;
32
+
33
+ const rb_data_type_t ruby_whisper_vad_params_type = {
34
+ "ruby_whisper_vad_params",
35
+ {0, 0, ruby_whisper_vad_params_memsize,},
36
+ 0, 0,
37
+ 0
38
+ };
39
+
40
+ static VALUE
41
+ ruby_whisper_vad_params_s_allocate(VALUE klass)
42
+ {
43
+ ruby_whisper_vad_params *rwvp;
44
+ VALUE obj = TypedData_Make_Struct(klass, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
45
+ rwvp->params = whisper_vad_default_params();
46
+ return obj;
47
+ }
48
+
49
+ /*
50
+ * Probability threshold to consider as speech.
51
+ *
52
+ * call-seq:
53
+ * threshold = th -> th
54
+ */
55
+ static VALUE
56
+ ruby_whisper_vad_params_set_threshold(VALUE self, VALUE value)
57
+ {
58
+ ruby_whisper_vad_params *rwvp;
59
+ TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
60
+ rwvp->params.threshold = RFLOAT_VALUE(value);
61
+ return value;
62
+ }
63
+
64
+ static VALUE
65
+ ruby_whisper_vad_params_get_threshold(VALUE self)
66
+ {
67
+ ruby_whisper_vad_params *rwvp;
68
+ TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
69
+ return DBL2NUM(rwvp->params.threshold);
70
+ }
71
+
72
+ /*
73
+ * Min duration for a valid speech segment.
74
+ *
75
+ * call-seq:
76
+ * min_speech_duration_ms = duration_ms -> duration_ms
77
+ */
78
+ static VALUE
79
+ ruby_whisper_vad_params_set_min_speech_duration_ms(VALUE self, VALUE value)
80
+ {
81
+ ruby_whisper_vad_params *rwvp;
82
+ TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
83
+ rwvp->params.min_speech_duration_ms = NUM2INT(value);
84
+ return value;
85
+ }
86
+
87
+ static VALUE
88
+ ruby_whisper_vad_params_get_min_speech_duration_ms(VALUE self)
89
+ {
90
+ ruby_whisper_vad_params *rwvp;
91
+ TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
92
+ return INT2NUM(rwvp->params.min_speech_duration_ms);
93
+ }
94
+
95
+ /*
96
+ * Min silence duration to consider speech as ended.
97
+ *
98
+ * call-seq:
99
+ * min_silence_duration_ms = duration_ms -> duration_ms
100
+ */
101
+ static VALUE
102
+ ruby_whisper_vad_params_set_min_silence_duration_ms(VALUE self, VALUE value)
103
+ {
104
+ ruby_whisper_vad_params *rwvp;
105
+ TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
106
+ rwvp->params.min_silence_duration_ms = NUM2INT(value);
107
+ return value;
108
+ }
109
+
110
+ static VALUE
111
+ ruby_whisper_vad_params_get_min_silence_duration_ms(VALUE self)
112
+ {
113
+ ruby_whisper_vad_params *rwvp;
114
+ TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
115
+ return INT2NUM(rwvp->params.min_silence_duration_ms);
116
+ }
117
+
118
+ /*
119
+ * Max duration of a speech segment before forcing a new segment.
120
+ *
121
+ * call-seq:
122
+ * max_speech_duration_s = duration_s -> duration_s
123
+ */
124
+ static VALUE
125
+ ruby_whisper_vad_params_set_max_speech_duration_s(VALUE self, VALUE value)
126
+ {
127
+ ruby_whisper_vad_params *rwvp;
128
+ TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
129
+ rwvp->params.max_speech_duration_s = RFLOAT_VALUE(value);
130
+ return value;
131
+ }
132
+
133
+ static VALUE
134
+ ruby_whisper_vad_params_get_max_speech_duration_s(VALUE self)
135
+ {
136
+ ruby_whisper_vad_params *rwvp;
137
+ TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
138
+ return DBL2NUM(rwvp->params.max_speech_duration_s);
139
+ }
140
+
141
+ /*
142
+ * Padding added before and after speech segments.
143
+ *
144
+ * call-seq:
145
+ * speech_pad_ms = pad_ms -> pad_ms
146
+ */
147
+ static VALUE
148
+ ruby_whisper_vad_params_set_speech_pad_ms(VALUE self, VALUE value)
149
+ {
150
+ ruby_whisper_vad_params *rwvp;
151
+ TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
152
+ rwvp->params.speech_pad_ms = NUM2INT(value);
153
+ return value;
154
+ }
155
+
156
+ static VALUE
157
+ ruby_whisper_vad_params_get_speech_pad_ms(VALUE self)
158
+ {
159
+ ruby_whisper_vad_params *rwvp;
160
+ TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
161
+ return INT2NUM(rwvp->params.speech_pad_ms);
162
+ }
163
+
164
+ /*
165
+ * Overlap in seconds when copying audio samples from speech segment.
166
+ *
167
+ * call-seq:
168
+ * samples_overlap = overlap -> overlap
169
+ */
170
+ static VALUE
171
+ ruby_whisper_vad_params_set_samples_overlap(VALUE self, VALUE value)
172
+ {
173
+ ruby_whisper_vad_params *rwvp;
174
+ TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
175
+ rwvp->params.samples_overlap = RFLOAT_VALUE(value);
176
+ return value;
177
+ }
178
+
179
+ static VALUE
180
+ ruby_whisper_vad_params_get_samples_overlap(VALUE self)
181
+ {
182
+ ruby_whisper_vad_params *rwvp;
183
+ TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
184
+ return DBL2NUM(rwvp->params.samples_overlap);
185
+ }
186
+
187
+ static VALUE
188
+ ruby_whisper_vad_params_equal(VALUE self, VALUE other)
189
+ {
190
+ ruby_whisper_vad_params *rwvp1;
191
+ ruby_whisper_vad_params *rwvp2;
192
+
193
+ if (self == other) {
194
+ return Qtrue;
195
+ }
196
+
197
+ if (!rb_obj_is_kind_of(other, cVADParams)) {
198
+ return Qfalse;
199
+ }
200
+
201
+ TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp1);
202
+ TypedData_Get_Struct(other, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp2);
203
+
204
+ if (rwvp1->params.threshold != rwvp2->params.threshold) {
205
+ return Qfalse;
206
+ }
207
+ if (rwvp1->params.min_speech_duration_ms != rwvp2->params.min_speech_duration_ms) {
208
+ return Qfalse;
209
+ }
210
+ if (rwvp1->params.min_silence_duration_ms != rwvp2->params.min_silence_duration_ms) {
211
+ return Qfalse;
212
+ }
213
+ if (rwvp1->params.max_speech_duration_s != rwvp2->params.max_speech_duration_s) {
214
+ return Qfalse;
215
+ }
216
+ if (rwvp1->params.speech_pad_ms != rwvp2->params.speech_pad_ms) {
217
+ return Qfalse;
218
+ }
219
+ if (rwvp1->params.samples_overlap != rwvp2->params.samples_overlap) {
220
+ return Qfalse;
221
+ }
222
+
223
+ return Qtrue;
224
+ }
225
+
226
+ #define SET_PARAM_IF_SAME(param_name) \
227
+ if (id == id_ ## param_name) { \
228
+ ruby_whisper_vad_params_set_ ## param_name(self, value); \
229
+ continue; \
230
+ }
231
+
232
+ VALUE
233
+ ruby_whisper_vad_params_initialize(int argc, VALUE *argv, VALUE self)
234
+ {
235
+ VALUE kw_hash;
236
+ VALUE values[NUM_PARAMS] = {Qundef};
237
+ VALUE value;
238
+ ruby_whisper_vad_params *rwvp;
239
+ ID id;
240
+ int i;
241
+
242
+ TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
243
+
244
+ rb_scan_args_kw(RB_SCAN_ARGS_KEYWORDS, argc, argv, ":", &kw_hash);
245
+ if (NIL_P(kw_hash)) {
246
+ return self;
247
+ }
248
+
249
+ rb_get_kwargs(kw_hash, param_names, 0, NUM_PARAMS, values);
250
+
251
+ for (i = 0; i < NUM_PARAMS; i++) {
252
+ id= param_names[i];
253
+ value = values[i];
254
+ if (value == Qundef) {
255
+ continue;
256
+ }
257
+ SET_PARAM_IF_SAME(threshold)
258
+ SET_PARAM_IF_SAME(min_speech_duration_ms)
259
+ SET_PARAM_IF_SAME(min_silence_duration_ms)
260
+ SET_PARAM_IF_SAME(max_speech_duration_s)
261
+ SET_PARAM_IF_SAME(speech_pad_ms)
262
+ SET_PARAM_IF_SAME(samples_overlap)
263
+ }
264
+
265
+ return self;
266
+ }
267
+
268
+ #undef SET_PARAM_IF_SAME
269
+
270
+ void
271
+ init_ruby_whisper_vad_params(VALUE *mVAD)
272
+ {
273
+ cVADParams = rb_define_class_under(*mVAD, "Params", rb_cObject);
274
+ rb_define_alloc_func(cVADParams, ruby_whisper_vad_params_s_allocate);
275
+ rb_define_method(cVADParams, "initialize", ruby_whisper_vad_params_initialize, -1);
276
+
277
+ DEFINE_PARAM(threshold, 0)
278
+ DEFINE_PARAM(min_speech_duration_ms, 1)
279
+ DEFINE_PARAM(min_silence_duration_ms, 2)
280
+ DEFINE_PARAM(max_speech_duration_s, 3)
281
+ DEFINE_PARAM(speech_pad_ms, 4)
282
+ DEFINE_PARAM(samples_overlap, 5)
283
+
284
+ rb_define_method(cVADParams, "==", ruby_whisper_vad_params_equal, 1);
285
+ }
286
+
287
+ #undef DEFINE_PARAM
288
+ #undef NUM_PARAMS
bindings/ruby/lib/whisper/model/uri.rb CHANGED
@@ -165,6 +165,12 @@ module Whisper
165
  models[name] = URI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}.bin")
166
  }
167
 
 
 
 
 
 
 
168
  class << self
169
  attr_reader :pre_converted_models
170
  end
 
165
  models[name] = URI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}.bin")
166
  }
167
 
168
+ %w[
169
+ silero-v5.1.2
170
+ ].each do |name|
171
+ @pre_converted_models[name] = URI.new("https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-#{name}.bin")
172
+ end
173
+
174
  class << self
175
  attr_reader :pre_converted_models
176
  end
bindings/ruby/sig/whisper.rbs CHANGED
@@ -150,7 +150,10 @@ module Whisper
150
  ?encoder_begin_callback: encoder_begin_callback,
151
  ?encoder_begin_callback_user_data: Object,
152
  ?abort_callback: abort_callback,
153
- ?abort_callback_user_data: Object
 
 
 
154
  ) -> instance
155
 
156
  # params.language = "auto" | "en", etc...
@@ -338,6 +341,20 @@ module Whisper
338
 
339
  def abort_callback_user_data: () -> Object
340
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  # Hook called on new segment. Yields each Whisper::Segment.
342
  #
343
  # whisper.on_new_segment do |segment|
@@ -406,6 +423,55 @@ module Whisper
406
  def no_speech_prob: () -> Float
407
  end
408
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
  class Error < StandardError
410
  attr_reader code: Integer
411
 
 
150
  ?encoder_begin_callback: encoder_begin_callback,
151
  ?encoder_begin_callback_user_data: Object,
152
  ?abort_callback: abort_callback,
153
+ ?abort_callback_user_data: Object,
154
+ ?vad: boolish,
155
+ ?vad_model_path: path | URI,
156
+ ?vad_params: Whisper::VAD::Params
157
  ) -> instance
158
 
159
  # params.language = "auto" | "en", etc...
 
341
 
342
  def abort_callback_user_data: () -> Object
343
 
344
+ # Enable VAD
345
+ #
346
+ def vad=: (boolish) -> boolish
347
+
348
+ def vad: () -> (true | false)
349
+
350
+ # Path to the VAD model
351
+ def vad_model_path=: (path | URI | nil) -> (path | URI | nil)
352
+
353
+ def vad_model_path: () -> (String | nil)
354
+
355
+ def vad_params=: (Whisper::VAD::Params) -> Whisper::VAD::Params
356
+ def vad_params: () -> (Whisper::VAD::Params)
357
+
358
  # Hook called on new segment. Yields each Whisper::Segment.
359
  #
360
  # whisper.on_new_segment do |segment|
 
423
  def no_speech_prob: () -> Float
424
  end
425
 
426
+ module VAD
427
+ class Params
428
+ def self.new: (
429
+ ?threshold: Float,
430
+ ?min_speech_duration_ms: Integer,
431
+ ?min_silence_duration_ms: Integer,
432
+ ?max_speech_duration_s: Float,
433
+ ?speech_pad_ms: Integer,
434
+ ?samples_overlap: Float
435
+ ) -> instance
436
+
437
+ # Probability threshold to consider as speech.
438
+ #
439
+ def threshold=: (Float) -> Float
440
+
441
+ def threshold: () -> Float
442
+
443
+ # Min duration for a valid speech segment.
444
+ #
445
+ def min_speech_duration_ms=: (Integer) -> Integer
446
+
447
+ def min_speech_duration_ms: () -> Integer
448
+
449
+ # Min silence duration to consider speech as ended.
450
+ #
451
+ def min_silence_duration_ms=: (Integer) -> Integer
452
+
453
+ def min_silence_duration_ms: () -> Integer
454
+
455
+ # Max duration of a speech segment before forcing a new segment.
456
+ def max_speech_duration_s=: (Float) -> Float
457
+
458
+ def max_speech_duration_s: () -> Float
459
+
460
+ # Padding added before and after speech segments.
461
+ #
462
+ def speech_pad_ms=: (Integer) -> Integer
463
+
464
+ def speech_pad_ms: () -> Integer
465
+
466
+ # Overlap in seconds when copying audio samples from speech segment.
467
+ #
468
+ def samples_overlap=: (Float) -> Float
469
+
470
+ def samples_overlap: () -> Float
471
+ def ==: (Params) -> (true | false)
472
+ end
473
+ end
474
+
475
  class Error < StandardError
476
  attr_reader code: Integer
477
 
bindings/ruby/tests/test_params.rb CHANGED
@@ -32,6 +32,9 @@ class TestParams < TestBase
32
  :progress_callback_user_data,
33
  :abort_callback,
34
  :abort_callback_user_data,
 
 
 
35
  ]
36
 
37
  def setup
@@ -191,6 +194,50 @@ class TestParams < TestBase
191
  assert_in_delta 0.2, @params.no_speech_thold
192
  end
193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  def test_new_with_kw_args
195
  params = Whisper::Params.new(language: "es")
196
  assert_equal "es", params.language
@@ -225,6 +272,10 @@ class TestParams < TestBase
225
  proc {}
226
  in [/_user_data\Z/, *]
227
  Object.new
 
 
 
 
228
  end
229
  params = Whisper::Params.new(param => value)
230
  if Float === value
 
32
  :progress_callback_user_data,
33
  :abort_callback,
34
  :abort_callback_user_data,
35
+ :vad,
36
+ :vad_model_path,
37
+ :vad_params,
38
  ]
39
 
40
  def setup
 
194
  assert_in_delta 0.2, @params.no_speech_thold
195
  end
196
 
197
+ def test_vad
198
+ assert_false @params.vad
199
+ @params.vad = true
200
+ assert_true @params.vad
201
+ end
202
+
203
+ def test_vad_model_path
204
+ assert_nil @params.vad_model_path
205
+ @params.vad_model_path = "silero-v5.1.2"
206
+ assert_equal Whisper::Model.pre_converted_models["silero-v5.1.2"].to_path, @params.vad_model_path
207
+ end
208
+
209
+ def test_vad_model_path_with_nil
210
+ @params.vad_model_path = "silero-v5.1.2"
211
+ @params.vad_model_path = nil
212
+ assert_nil @params.vad_model_path
213
+ end
214
+
215
+ def test_vad_model_path_with_invalid
216
+ assert_raise TypeError do
217
+ @params.vad_model_path = Object.new
218
+ end
219
+ end
220
+
221
+ def test_vad_model_path_with_URI_string
222
+ @params.vad_model_path = "https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v5.1.2.bin"
223
+ assert_equal @params.vad_model_path, Whisper::Model.pre_converted_models["silero-v5.1.2"].to_path
224
+ end
225
+
226
+ def test_vad_model_path_with_URI
227
+ @params.vad_model_path = URI("https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v5.1.2.bin")
228
+ assert_equal @params.vad_model_path, Whisper::Model.pre_converted_models["silero-v5.1.2"].to_path
229
+ end
230
+
231
+ def test_vad_params
232
+ assert_kind_of Whisper::VAD::Params, @params.vad_params
233
+ default_params = @params.vad_params
234
+ assert_same default_params, @params.vad_params
235
+ assert_equal 0.5, default_params.threshold
236
+ new_params = Whisper::VAD::Params.new
237
+ @params.vad_params = new_params
238
+ assert_same new_params, @params.vad_params
239
+ end
240
+
241
  def test_new_with_kw_args
242
  params = Whisper::Params.new(language: "es")
243
  assert_equal "es", params.language
 
272
  proc {}
273
  in [/_user_data\Z/, *]
274
  Object.new
275
+ in [:vad_model_path, *]
276
+ Whisper::Model.pre_converted_models["silero-v5.1.2"].to_path
277
+ in [:vad_params, *]
278
+ Whisper::VAD::Params.new
279
  end
280
  params = Whisper::Params.new(param => value)
281
  if Float === value
bindings/ruby/tests/test_vad.rb ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ require_relative "helper"
2
+
3
+ class TestVAD < TestBase
4
+ def setup
5
+ @whisper = Whisper::Context.new("base.en")
6
+ vad_params = Whisper::VAD::Params.new
7
+ @params = Whisper::Params.new(
8
+ vad: true,
9
+ vad_model_path: "silero-v5.1.2",
10
+ vad_params:
11
+ )
12
+ end
13
+
14
+ def test_transcribe
15
+ @whisper.transcribe(TestBase::AUDIO, @params) do |text|
16
+ assert_match(/ask not what your country can do for you[,.] ask what you can do for your country/i, text)
17
+ end
18
+ end
19
+ end
bindings/ruby/tests/test_vad_params.rb ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ require_relative "helper"
2
+
3
+ class TestVADParams < TestBase
4
+ PARAM_NAMES = [
5
+ :threshold,
6
+ :min_speech_duration_ms,
7
+ :min_silence_duration_ms,
8
+ :max_speech_duration_s,
9
+ :speech_pad_ms,
10
+ :samples_overlap
11
+ ]
12
+
13
+ def setup
14
+ @params = Whisper::VAD::Params.new
15
+ end
16
+
17
+ def test_new
18
+ params = Whisper::VAD::Params.new
19
+ assert_kind_of Whisper::VAD::Params, params
20
+ end
21
+
22
+ def test_threshold
23
+ assert_in_delta @params.threshold, 0.5
24
+ @params.threshold = 0.7
25
+ assert_in_delta @params.threshold, 0.7
26
+ end
27
+
28
+ def test_min_speech_duration
29
+ pend
30
+ end
31
+
32
+ def test_min_speech_duration_ms
33
+ assert_equal 250, @params.min_speech_duration_ms
34
+ @params.min_speech_duration_ms = 500
35
+ assert_equal 500, @params.min_speech_duration_ms
36
+ end
37
+
38
+ def test_min_silence_duration_ms
39
+ assert_equal 100, @params.min_silence_duration_ms
40
+ @params.min_silence_duration_ms = 200
41
+ assert_equal 200, @params.min_silence_duration_ms
42
+ end
43
+
44
+ def test_max_speech_duration
45
+ pend
46
+ end
47
+
48
+ def test_max_speech_duration_s
49
+ assert @params.max_speech_duration_s >= 10e37 # Defaults to FLT_MAX
50
+ @params.max_speech_duration_s = 60.0
51
+ assert_equal 60.0, @params.max_speech_duration_s
52
+ end
53
+
54
+ def test_speech_pad_ms
55
+ assert_equal 30, @params.speech_pad_ms
56
+ @params.speech_pad_ms = 50
57
+ assert_equal 50, @params.speech_pad_ms
58
+ end
59
+
60
+ def test_samples_overlap
61
+ assert_in_delta @params.samples_overlap, 0.1
62
+ @params.samples_overlap = 0.5
63
+ assert_in_delta @params.samples_overlap, 0.5
64
+ end
65
+
66
+ def test_equal
67
+ assert_equal @params, Whisper::VAD::Params.new
68
+ end
69
+
70
+ def test_new_with_kw_args
71
+ params = Whisper::VAD::Params.new(threshold: 0.7)
72
+ assert_in_delta params.threshold, 0.7
73
+ assert_equal 250, params.min_speech_duration_ms
74
+ end
75
+
76
+ def test_new_with_kw_args_non_existent
77
+ assert_raise ArgumentError do
78
+ Whisper::VAD::Params.new(non_existent: "value")
79
+ end
80
+ end
81
+
82
+ data(PARAM_NAMES.collect {|param| [param, param]}.to_h)
83
+ def test_new_with_kw_args_default_values(param)
84
+ default_value = @params.send(param)
85
+ value = default_value + 1
86
+ params = Whisper::VAD::Params.new(param => value)
87
+ if Float === value
88
+ assert_in_delta value, params.send(param)
89
+ else
90
+ assert_equal value, params.send(param)
91
+ end
92
+
93
+ PARAM_NAMES.reject {|name| name == param}.each do |name|
94
+ expected = @params.send(name)
95
+ actual = params.send(name)
96
+ if Float === expected
97
+ assert_in_delta expected, actual
98
+ else
99
+ assert_equal expected, actual
100
+ end
101
+ end
102
+ end
103
+ end