Spaces:
Running
Running
ruby : output format (#3237)
Browse files
* Fix a typo
* Don't allocate output string unless needed
* Add methods to output SRT and WebVTT
* Add tests for output methods
* Make constants for output private
* Add signatures for output methods
* Add document on output methods
* Fix method name: Segment#speaker_next_turn? -> #speaker_turn_next?
* Add Whisper::Segment#deconstruct_keys
* Add test for Whisper::Context#deconstruct_keys
* Add signature of Whisper::Segment#deconstruct_keys
* Use parentheses to suppress warning
* Update date
- bindings/ruby/README.md +28 -2
- bindings/ruby/ext/ruby_whisper.c +2 -0
- bindings/ruby/ext/ruby_whisper_context.c +1 -1
- bindings/ruby/ext/ruby_whisper_segment.c +78 -1
- bindings/ruby/ext/ruby_whisper_transcribe.cpp +4 -3
- bindings/ruby/lib/whisper/context.rb +15 -0
- bindings/ruby/lib/whisper/segment.rb +58 -0
- bindings/ruby/sig/whisper.rbs +23 -1
- bindings/ruby/test/test_segment.rb +62 -0
- bindings/ruby/test/test_whisper.rb +45 -1
- bindings/ruby/whispercpp.gemspec +1 -1
bindings/ruby/README.md
CHANGED
|
@@ -162,6 +162,32 @@ Whisper::Params.new(
|
|
| 162 |
|
| 163 |
For details on VAD, see [whisper.cpp's README](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#voice-activity-detection-vad).
|
| 164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
API
|
| 166 |
---
|
| 167 |
|
|
@@ -196,7 +222,7 @@ whisper
|
|
| 196 |
ed: format_time(segment.end_time),
|
| 197 |
text: segment.text
|
| 198 |
}
|
| 199 |
-
line << " (speaker turned)" if segment.speaker_next_turn?
|
| 200 |
puts line
|
| 201 |
end
|
| 202 |
|
|
@@ -212,7 +238,7 @@ params.on_new_segment do |segment|
|
|
| 212 |
ed: format_time(segment.end_time),
|
| 213 |
text: segment.text
|
| 214 |
}
|
| 215 |
-
line << " (speaker turned)" if segment.speaker_next_turn?
|
| 216 |
puts line
|
| 217 |
end
|
| 218 |
|
|
|
|
| 162 |
|
| 163 |
For details on VAD, see [whisper.cpp's README](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#voice-activity-detection-vad).
|
| 164 |
|
| 165 |
+
### Output ###
|
| 166 |
+
|
| 167 |
+
whispercpp supports SRT and WebVTT output:
|
| 168 |
+
|
| 169 |
+
```ruby
|
| 170 |
+
puts whisper.transcribe("path/to/audio.wav", Whisper::Params.new).to_webvtt
|
| 171 |
+
# =>
|
| 172 |
+
WEBVTT
|
| 173 |
+
|
| 174 |
+
1
|
| 175 |
+
00:00:00.000 --> 00:00:03.860
|
| 176 |
+
My thought I have nobody by a beauty and will as you poured.
|
| 177 |
+
|
| 178 |
+
2
|
| 179 |
+
00:00:03.860 --> 00:00:09.840
|
| 180 |
+
Mr. Rochester is sub in that so-don't find simplest, and devoted about, to let might in
|
| 181 |
+
|
| 182 |
+
3
|
| 183 |
+
00:00:09.840 --> 00:00:09.940
|
| 184 |
+
a
|
| 185 |
+
|
| 186 |
+
```
|
| 187 |
+
|
| 188 |
+
You may call `#to_srt`, too
|
| 189 |
+
|
| 190 |
+
|
| 191 |
API
|
| 192 |
---
|
| 193 |
|
|
|
|
| 222 |
ed: format_time(segment.end_time),
|
| 223 |
text: segment.text
|
| 224 |
}
|
| 225 |
+
line << " (speaker turned)" if segment.speaker_turn_next?
|
| 226 |
puts line
|
| 227 |
end
|
| 228 |
|
|
|
|
| 238 |
ed: format_time(segment.end_time),
|
| 239 |
text: segment.text
|
| 240 |
}
|
| 241 |
+
line << " (speaker turned)" if segment.speaker_turn_next?
|
| 242 |
puts line
|
| 243 |
end
|
| 244 |
|
bindings/ruby/ext/ruby_whisper.c
CHANGED
|
@@ -170,5 +170,7 @@ void Init_whisper() {
|
|
| 170 |
init_ruby_whisper_model(&mWhisper);
|
| 171 |
init_ruby_whisper_vad_params(&mVAD);
|
| 172 |
|
|
|
|
|
|
|
| 173 |
rb_require("whisper/model/uri");
|
| 174 |
}
|
|
|
|
| 170 |
init_ruby_whisper_model(&mWhisper);
|
| 171 |
init_ruby_whisper_vad_params(&mVAD);
|
| 172 |
|
| 173 |
+
rb_require("whisper/context");
|
| 174 |
+
rb_require("whisper/segment");
|
| 175 |
rb_require("whisper/model/uri");
|
| 176 |
}
|
bindings/ruby/ext/ruby_whisper_context.c
CHANGED
|
@@ -664,7 +664,7 @@ init_ruby_whisper_context(VALUE *mWhisper)
|
|
| 664 |
rb_define_method(cContext, "full", ruby_whisper_full, -1);
|
| 665 |
rb_define_method(cContext, "full_parallel", ruby_whisper_full_parallel, -1);
|
| 666 |
|
| 667 |
-
// High
|
| 668 |
rb_define_method(cContext, "full_get_segment", ruby_whisper_full_get_segment, 1);
|
| 669 |
rb_define_method(cContext, "each_segment", ruby_whisper_each_segment, 0);
|
| 670 |
|
|
|
|
| 664 |
rb_define_method(cContext, "full", ruby_whisper_full, -1);
|
| 665 |
rb_define_method(cContext, "full_parallel", ruby_whisper_full_parallel, -1);
|
| 666 |
|
| 667 |
+
// High level
|
| 668 |
rb_define_method(cContext, "full_get_segment", ruby_whisper_full_get_segment, 1);
|
| 669 |
rb_define_method(cContext, "each_segment", ruby_whisper_each_segment, 0);
|
| 670 |
|
bindings/ruby/ext/ruby_whisper_segment.c
CHANGED
|
@@ -1,6 +1,15 @@
|
|
| 1 |
#include <ruby.h>
|
| 2 |
#include "ruby_whisper.h"
|
| 3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
extern const rb_data_type_t ruby_whisper_type;
|
| 5 |
|
| 6 |
extern VALUE cSegment;
|
|
@@ -129,15 +138,83 @@ ruby_whisper_segment_get_no_speech_prob(VALUE self)
|
|
| 129 |
return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index));
|
| 130 |
}
|
| 131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
void
|
| 133 |
init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cContext)
|
| 134 |
{
|
| 135 |
cSegment = rb_define_class_under(*mWhisper, "Segment", rb_cObject);
|
| 136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
rb_define_alloc_func(cSegment, ruby_whisper_segment_allocate);
|
| 138 |
rb_define_method(cSegment, "start_time", ruby_whisper_segment_get_start_time, 0);
|
| 139 |
rb_define_method(cSegment, "end_time", ruby_whisper_segment_get_end_time, 0);
|
| 140 |
-
rb_define_method(cSegment, "
|
| 141 |
rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
|
| 142 |
rb_define_method(cSegment, "no_speech_prob", ruby_whisper_segment_get_no_speech_prob, 0);
|
|
|
|
| 143 |
}
|
|
|
|
| 1 |
#include <ruby.h>
|
| 2 |
#include "ruby_whisper.h"
|
| 3 |
|
| 4 |
+
#define N_KEY_NAMES 5
|
| 5 |
+
|
| 6 |
+
static VALUE sym_start_time;
|
| 7 |
+
static VALUE sym_end_time;
|
| 8 |
+
static VALUE sym_text;
|
| 9 |
+
static VALUE sym_no_speech_prob;
|
| 10 |
+
static VALUE sym_speaker_turn_next;
|
| 11 |
+
static VALUE key_names;
|
| 12 |
+
|
| 13 |
extern const rb_data_type_t ruby_whisper_type;
|
| 14 |
|
| 15 |
extern VALUE cSegment;
|
|
|
|
| 138 |
return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index));
|
| 139 |
}
|
| 140 |
|
| 141 |
+
/*
|
| 142 |
+
* call-seq:
|
| 143 |
+
* deconstruct_keys(keys) -> hash
|
| 144 |
+
*
|
| 145 |
+
* Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
|
| 146 |
+
*
|
| 147 |
+
* whisper.each_segment do |segment|
|
| 148 |
+
* segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
|
| 149 |
+
*
|
| 150 |
+
* puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
|
| 151 |
+
* end
|
| 152 |
+
*/
|
| 153 |
+
static VALUE
|
| 154 |
+
ruby_whisper_segment_deconstruct_keys(VALUE self, VALUE keys)
|
| 155 |
+
{
|
| 156 |
+
ruby_whisper_segment *rws;
|
| 157 |
+
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
| 158 |
+
ruby_whisper *rw;
|
| 159 |
+
TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
|
| 160 |
+
|
| 161 |
+
VALUE hash = rb_hash_new();
|
| 162 |
+
long n_keys;
|
| 163 |
+
if (NIL_P(keys)) {
|
| 164 |
+
keys = key_names;
|
| 165 |
+
n_keys = N_KEY_NAMES;
|
| 166 |
+
} else {
|
| 167 |
+
n_keys = RARRAY_LEN(keys);
|
| 168 |
+
if (n_keys > N_KEY_NAMES) {
|
| 169 |
+
return hash;
|
| 170 |
+
}
|
| 171 |
+
}
|
| 172 |
+
for (int i = 0; i < n_keys; i++) {
|
| 173 |
+
VALUE key = rb_ary_entry(keys, i);
|
| 174 |
+
if (key == sym_start_time) {
|
| 175 |
+
rb_hash_aset(hash, key, ruby_whisper_segment_get_start_time(self));
|
| 176 |
+
}
|
| 177 |
+
if (key == sym_end_time) {
|
| 178 |
+
rb_hash_aset(hash, key, ruby_whisper_segment_get_end_time(self));
|
| 179 |
+
}
|
| 180 |
+
if (key == sym_text) {
|
| 181 |
+
rb_hash_aset(hash, key, ruby_whisper_segment_get_text(self));
|
| 182 |
+
}
|
| 183 |
+
if (key == sym_no_speech_prob) {
|
| 184 |
+
rb_hash_aset(hash, key, ruby_whisper_segment_get_no_speech_prob(self));
|
| 185 |
+
}
|
| 186 |
+
if (key == sym_speaker_turn_next) {
|
| 187 |
+
rb_hash_aset(hash, key, ruby_whisper_segment_get_speaker_turn_next(self));
|
| 188 |
+
}
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
return hash;
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
void
|
| 195 |
init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cContext)
|
| 196 |
{
|
| 197 |
cSegment = rb_define_class_under(*mWhisper, "Segment", rb_cObject);
|
| 198 |
|
| 199 |
+
sym_start_time = ID2SYM(rb_intern("start_time"));
|
| 200 |
+
sym_end_time = ID2SYM(rb_intern("end_time"));
|
| 201 |
+
sym_text = ID2SYM(rb_intern("text"));
|
| 202 |
+
sym_no_speech_prob = ID2SYM(rb_intern("no_speech_prob"));
|
| 203 |
+
sym_speaker_turn_next = ID2SYM(rb_intern("speaker_turn_next"));
|
| 204 |
+
key_names = rb_ary_new3(
|
| 205 |
+
N_KEY_NAMES,
|
| 206 |
+
sym_start_time,
|
| 207 |
+
sym_end_time,
|
| 208 |
+
sym_text,
|
| 209 |
+
sym_no_speech_prob,
|
| 210 |
+
sym_speaker_turn_next
|
| 211 |
+
);
|
| 212 |
+
|
| 213 |
rb_define_alloc_func(cSegment, ruby_whisper_segment_allocate);
|
| 214 |
rb_define_method(cSegment, "start_time", ruby_whisper_segment_get_start_time, 0);
|
| 215 |
rb_define_method(cSegment, "end_time", ruby_whisper_segment_get_end_time, 0);
|
| 216 |
+
rb_define_method(cSegment, "speaker_turn_next?", ruby_whisper_segment_get_speaker_turn_next, 0);
|
| 217 |
rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
|
| 218 |
rb_define_method(cSegment, "no_speech_prob", ruby_whisper_segment_get_no_speech_prob, 0);
|
| 219 |
+
rb_define_method(cSegment, "deconstruct_keys", ruby_whisper_segment_deconstruct_keys, 1);
|
| 220 |
}
|
bindings/ruby/ext/ruby_whisper_transcribe.cpp
CHANGED
|
@@ -76,15 +76,16 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
|
|
| 76 |
fprintf(stderr, "failed to process audio\n");
|
| 77 |
return self;
|
| 78 |
}
|
|
|
|
|
|
|
|
|
|
| 79 |
const int n_segments = whisper_full_n_segments(rw->context);
|
| 80 |
VALUE output = rb_str_new2("");
|
| 81 |
for (int i = 0; i < n_segments; ++i) {
|
| 82 |
const char * text = whisper_full_get_segment_text(rw->context, i);
|
| 83 |
output = rb_str_concat(output, rb_str_new2(text));
|
| 84 |
}
|
| 85 |
-
|
| 86 |
-
rb_funcall(blk, id_call, 1, output);
|
| 87 |
-
}
|
| 88 |
return self;
|
| 89 |
}
|
| 90 |
#ifdef __cplusplus
|
|
|
|
| 76 |
fprintf(stderr, "failed to process audio\n");
|
| 77 |
return self;
|
| 78 |
}
|
| 79 |
+
if (NIL_P(blk)) {
|
| 80 |
+
return self;
|
| 81 |
+
}
|
| 82 |
const int n_segments = whisper_full_n_segments(rw->context);
|
| 83 |
VALUE output = rb_str_new2("");
|
| 84 |
for (int i = 0; i < n_segments; ++i) {
|
| 85 |
const char * text = whisper_full_get_segment_text(rw->context, i);
|
| 86 |
output = rb_str_concat(output, rb_str_new2(text));
|
| 87 |
}
|
| 88 |
+
rb_funcall(blk, id_call, 1, output);
|
|
|
|
|
|
|
| 89 |
return self;
|
| 90 |
}
|
| 91 |
#ifdef __cplusplus
|
bindings/ruby/lib/whisper/context.rb
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
module Whisper
|
| 2 |
+
class Context
|
| 3 |
+
def to_srt
|
| 4 |
+
each_segment.with_index.reduce("") {|srt, (segment, index)|
|
| 5 |
+
srt << "#{index + 1}\n#{segment.to_srt_cue}\n"
|
| 6 |
+
}
|
| 7 |
+
end
|
| 8 |
+
|
| 9 |
+
def to_webvtt
|
| 10 |
+
each_segment.with_index.reduce("WEBVTT\n\n") {|webvtt, (segment, index)|
|
| 11 |
+
webvtt << "#{index + 1}\n#{segment.to_webvtt_cue}\n"
|
| 12 |
+
}
|
| 13 |
+
end
|
| 14 |
+
end
|
| 15 |
+
end
|
bindings/ruby/lib/whisper/segment.rb
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
module Whisper
|
| 2 |
+
class Segment
|
| 3 |
+
SRT_ESCAPES = {
|
| 4 |
+
"&" => "&amp;",
|
| 5 |
+
"<" => "&lt;",
|
| 6 |
+
">" => "&gt;",
|
| 7 |
+
}
|
| 8 |
+
SRT_ESCAPES_RE = Regexp.union(SRT_ESCAPES.keys)
|
| 9 |
+
private_constant :SRT_ESCAPES, :SRT_ESCAPES_RE
|
| 10 |
+
|
| 11 |
+
def to_srt_cue
|
| 12 |
+
"#{srt_start_time} --> #{srt_end_time}\n#{srt_text}\n"
|
| 13 |
+
end
|
| 14 |
+
|
| 15 |
+
def to_webvtt_cue
|
| 16 |
+
"#{webvtt_start_time} --> #{webvtt_end_time}\n#{webvtt_text}\n"
|
| 17 |
+
end
|
| 18 |
+
|
| 19 |
+
private
|
| 20 |
+
|
| 21 |
+
def time_to_a(time)
|
| 22 |
+
sec, decimal_part = time.divmod(1000)
|
| 23 |
+
min, sec = sec.divmod(60)
|
| 24 |
+
hour, min = min.divmod(60)
|
| 25 |
+
[hour, min, sec, decimal_part]
|
| 26 |
+
end
|
| 27 |
+
|
| 28 |
+
def srt_time(time)
|
| 29 |
+
"%02d:%02d:%02d,%03d" % time_to_a(time)
|
| 30 |
+
end
|
| 31 |
+
|
| 32 |
+
def srt_start_time
|
| 33 |
+
srt_time(start_time)
|
| 34 |
+
end
|
| 35 |
+
|
| 36 |
+
def srt_end_time
|
| 37 |
+
srt_time(end_time)
|
| 38 |
+
end
|
| 39 |
+
|
| 40 |
+
def srt_text
|
| 41 |
+
text.gsub(SRT_ESCAPES_RE, SRT_ESCAPES)
|
| 42 |
+
end
|
| 43 |
+
|
| 44 |
+
def webvtt_time(time)
|
| 45 |
+
"%02d:%02d:%02d.%03d" % time_to_a(time)
|
| 46 |
+
end
|
| 47 |
+
|
| 48 |
+
def webvtt_start_time
|
| 49 |
+
webvtt_time(start_time)
|
| 50 |
+
end
|
| 51 |
+
|
| 52 |
+
def webvtt_end_time
|
| 53 |
+
webvtt_time(end_time)
|
| 54 |
+
end
|
| 55 |
+
|
| 56 |
+
alias webvtt_text srt_text
|
| 57 |
+
end
|
| 58 |
+
end
|
bindings/ruby/sig/whisper.rbs
CHANGED
|
@@ -116,6 +116,9 @@ module Whisper
|
|
| 116 |
def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
|
| 117 |
| (Params, _Samples, ?Integer n_samples) -> self
|
| 118 |
| (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
|
|
|
|
|
|
|
|
|
|
| 119 |
end
|
| 120 |
|
| 121 |
class Params
|
|
@@ -415,6 +418,14 @@ module Whisper
|
|
| 415 |
end
|
| 416 |
|
| 417 |
class Segment
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 418 |
# Start time in milliseconds.
|
| 419 |
#
|
| 420 |
def start_time: () -> Integer
|
|
@@ -424,10 +435,21 @@ module Whisper
|
|
| 424 |
def end_time: () -> Integer
|
| 425 |
|
| 426 |
# Whether the next segment is predicted as a speaker turn.
|
| 427 |
-
def
|
| 428 |
|
| 429 |
def text: () -> String
|
| 430 |
def no_speech_prob: () -> Float
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 431 |
end
|
| 432 |
|
| 433 |
module VAD
|
|
|
|
| 116 |
def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
|
| 117 |
| (Params, _Samples, ?Integer n_samples) -> self
|
| 118 |
| (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
|
| 119 |
+
|
| 120 |
+
def to_srt: () -> String
|
| 121 |
+
def to_webvtt: () -> String
|
| 122 |
end
|
| 123 |
|
| 124 |
class Params
|
|
|
|
| 418 |
end
|
| 419 |
|
| 420 |
class Segment
|
| 421 |
+
type deconstructed_keys = {
|
| 422 |
+
start_time: (Integer | nil),
|
| 423 |
+
end_time: (Integer | nil),
|
| 424 |
+
text: (String | nil),
|
| 425 |
+
no_speech_prob: (Float | nil),
|
| 426 |
+
speaker_turn_next: (true | false | nil)
|
| 427 |
+
}
|
| 428 |
+
|
| 429 |
# Start time in milliseconds.
|
| 430 |
#
|
| 431 |
def start_time: () -> Integer
|
|
|
|
| 435 |
def end_time: () -> Integer
|
| 436 |
|
| 437 |
# Whether the next segment is predicted as a speaker turn.
|
| 438 |
+
def speaker_turn_next?: () -> (true | false)
|
| 439 |
|
| 440 |
def text: () -> String
|
| 441 |
def no_speech_prob: () -> Float
|
| 442 |
+
def to_srt_cue: () -> String
|
| 443 |
+
def to_webvtt_cue: () -> String
|
| 444 |
+
|
| 445 |
+
# Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
|
| 446 |
+
#
|
| 447 |
+
# whisper.each_segment do |segment|
|
| 448 |
+
# segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
|
| 449 |
+
#
|
| 450 |
+
# puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
|
| 451 |
+
# end
|
| 452 |
+
def deconstruct_keys: (Array[:start_time | :end_time | :text | :no_speech_prob | :speaker_turn_next] | nil) -> deconstructed_keys
|
| 453 |
end
|
| 454 |
|
| 455 |
module VAD
|
bindings/ruby/test/test_segment.rb
CHANGED
|
@@ -71,4 +71,66 @@ class TestSegment < TestBase
|
|
| 71 |
end
|
| 72 |
whisper.transcribe(AUDIO, params)
|
| 73 |
end
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
end
|
|
|
|
| 71 |
end
|
| 72 |
whisper.transcribe(AUDIO, params)
|
| 73 |
end
|
| 74 |
+
|
| 75 |
+
def test_pattern_matching
|
| 76 |
+
segment = whisper.each_segment.first
|
| 77 |
+
segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
|
| 78 |
+
|
| 79 |
+
assert_equal segment.start_time, start_time
|
| 80 |
+
assert_equal segment.end_time, end_time
|
| 81 |
+
assert_equal segment.text, text
|
| 82 |
+
assert_equal segment.no_speech_prob, no_speech_prob
|
| 83 |
+
assert_equal segment.speaker_turn_next?, speaker_turn_next
|
| 84 |
+
end
|
| 85 |
+
|
| 86 |
+
def test_pattern_matching_partial
|
| 87 |
+
segment = whisper.each_segment.first
|
| 88 |
+
segment => {start_time:, end_time:, text:}
|
| 89 |
+
|
| 90 |
+
assert_equal segment.start_time, start_time
|
| 91 |
+
assert_equal segment.end_time, end_time
|
| 92 |
+
assert_equal segment.text, text
|
| 93 |
+
end
|
| 94 |
+
|
| 95 |
+
def test_deconstruct_keys
|
| 96 |
+
segment = whisper.each_segment.first
|
| 97 |
+
expected = {
|
| 98 |
+
start_time: segment.start_time,
|
| 99 |
+
end_time: segment.end_time,
|
| 100 |
+
text: segment.text,
|
| 101 |
+
no_speech_prob: segment.no_speech_prob,
|
| 102 |
+
speaker_turn_next: segment.speaker_turn_next?
|
| 103 |
+
}
|
| 104 |
+
assert_equal expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next])
|
| 105 |
+
end
|
| 106 |
+
|
| 107 |
+
def test_deconstruct_keys_non_existent
|
| 108 |
+
omit "Undefined behavior"
|
| 109 |
+
|
| 110 |
+
segment = whisper.each_segment.first
|
| 111 |
+
|
| 112 |
+
assert_equal({}, segment.deconstruct_keys([:non_existent]))
|
| 113 |
+
end
|
| 114 |
+
|
| 115 |
+
def test_deconstruct_keys_too_many_keys
|
| 116 |
+
omit "Undefined behavior"
|
| 117 |
+
|
| 118 |
+
segment = whisper.each_segment.first
|
| 119 |
+
|
| 120 |
+
assert_equal({}, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next, :extra_key]))
|
| 121 |
+
end
|
| 122 |
+
|
| 123 |
+
def test_deconstruct_keys_includes_non_existent_keys_not_too_many
|
| 124 |
+
omit "Undefined behavior"
|
| 125 |
+
|
| 126 |
+
segment = whisper.each_segment.first
|
| 127 |
+
|
| 128 |
+
expected = {
|
| 129 |
+
start_time: segment.start_time,
|
| 130 |
+
end_time: segment.end_time,
|
| 131 |
+
text: segment.text,
|
| 132 |
+
no_speech_prob: segment.no_speech_prob
|
| 133 |
+
}
|
| 134 |
+
assert_equal(expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :non_existent]))
|
| 135 |
+
end
|
| 136 |
end
|
bindings/ruby/test/test_whisper.rb
CHANGED
|
@@ -113,7 +113,7 @@ class TestWhisper < TestBase
|
|
| 113 |
end
|
| 114 |
|
| 115 |
def test_system_info_str
|
| 116 |
-
assert_match
|
| 117 |
end
|
| 118 |
|
| 119 |
def test_log_set
|
|
@@ -245,4 +245,48 @@ class TestWhisper < TestBase
|
|
| 245 |
assert_match(/for your country/i, text)
|
| 246 |
end
|
| 247 |
end
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
end
|
|
|
|
| 113 |
end
|
| 114 |
|
| 115 |
def test_system_info_str
|
| 116 |
+
assert_match(/\AWHISPER : COREML = \d | OPENVINO = \d |/, Whisper.system_info_str)
|
| 117 |
end
|
| 118 |
|
| 119 |
def test_log_set
|
|
|
|
| 245 |
assert_match(/for your country/i, text)
|
| 246 |
end
|
| 247 |
end
|
| 248 |
+
|
| 249 |
+
def test_to_srt
|
| 250 |
+
whisper = Whisper::Context.new("base.en")
|
| 251 |
+
whisper.transcribe AUDIO, @params
|
| 252 |
+
|
| 253 |
+
lines = whisper.to_srt.lines
|
| 254 |
+
assert_match(/\A\d+\n/, lines[0])
|
| 255 |
+
assert_match(/\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n/, lines[1])
|
| 256 |
+
assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[2])
|
| 257 |
+
end
|
| 258 |
+
|
| 259 |
+
def test_to_webvtt
|
| 260 |
+
whisper = Whisper::Context.new("base.en")
|
| 261 |
+
whisper.transcribe AUDIO, @params
|
| 262 |
+
|
| 263 |
+
lines = whisper.to_webvtt.lines
|
| 264 |
+
assert_equal "WEBVTT\n", lines[0]
|
| 265 |
+
assert_equal "\n", lines[1]
|
| 266 |
+
assert_match(/\A\d+\n/, lines[2])
|
| 267 |
+
assert_match(/\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}\n/, lines[3])
|
| 268 |
+
assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[4])
|
| 269 |
+
end
|
| 270 |
+
|
| 271 |
+
sub_test_case "Format needs escape" do
|
| 272 |
+
def setup
|
| 273 |
+
@whisper = Whisper::Context.new("base.en")
|
| 274 |
+
@whisper.transcribe AUDIO, Whisper::Params.new
|
| 275 |
+
segment = @whisper.each_segment.first
|
| 276 |
+
segment.define_singleton_method :text do
|
| 277 |
+
"& so my fellow Americans --> ask not what your country can do for you <-- ask what you can do for your country."
|
| 278 |
+
end
|
| 279 |
+
@whisper.define_singleton_method :each_segment do
|
| 280 |
+
Enumerator.new(3) {|yielder| 3.times {yielder << segment}}
|
| 281 |
+
end
|
| 282 |
+
end
|
| 283 |
+
|
| 284 |
+
def test_to_srt_escape
|
| 285 |
+
assert_equal "&amp; so my fellow Americans --&gt; ask not what your country can do for you &lt;-- ask what you can do for your country.\n", @whisper.to_srt.lines[2]
|
| 286 |
+
end
|
| 287 |
+
|
| 288 |
+
def test_to_webvtt_escape
|
| 289 |
+
assert_equal "&amp; so my fellow Americans --&gt; ask not what your country can do for you &lt;-- ask what you can do for your country.\n", @whisper.to_webvtt.lines[4]
|
| 290 |
+
end
|
| 291 |
+
end
|
| 292 |
end
|
bindings/ruby/whispercpp.gemspec
CHANGED
|
@@ -4,7 +4,7 @@ Gem::Specification.new do |s|
|
|
| 4 |
s.name = "whispercpp"
|
| 5 |
s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
|
| 6 |
s.version = '1.3.3'
|
| 7 |
-
s.date = '2025-06-
|
| 8 |
s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
|
| 9 |
s.email = '[email protected]'
|
| 10 |
s.extra_rdoc_files = ['LICENSE', 'README.md']
|
|
|
|
| 4 |
s.name = "whispercpp"
|
| 5 |
s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
|
| 6 |
s.version = '1.3.3'
|
| 7 |
+
s.date = '2025-06-10'
|
| 8 |
s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
|
| 9 |
s.email = '[email protected]'
|
| 10 |
s.extra_rdoc_files = ['LICENSE', 'README.md']
|