Spaces:

natasa365
/

whisper.cpp

Running

App Files Files Community

KitaitiMakoto committed on Jun 10

Commit

63cab25

unverified ·

1 Parent(s): 2c4b2dd

ruby : output format (#3237)

Browse files

* Fix a typo

* Don't allocate output string unless needed

* Add methods to output SRT and WebVTT

* Add tests for output methods

* Make constants for output private

* Add signatures for output methods

* Add document on output methods

* Fix method name: Segment#speaker_next_turn? -> #speaker_turn_next?

* Add Whisper::Segment#deconstruct_keys

* Add test for Whisper::Segment#deconstruct_keys

* Add signature of Whisper::Segment#deconstruct_keys

* Use parentheses to suppress warning

* Update date

Files changed (11) hide show

bindings/ruby/README.md +28 -2
bindings/ruby/ext/ruby_whisper.c +2 -0
bindings/ruby/ext/ruby_whisper_context.c +1 -1
bindings/ruby/ext/ruby_whisper_segment.c +78 -1
bindings/ruby/ext/ruby_whisper_transcribe.cpp +4 -3
bindings/ruby/lib/whisper/context.rb +15 -0
bindings/ruby/lib/whisper/segment.rb +58 -0
bindings/ruby/sig/whisper.rbs +23 -1
bindings/ruby/test/test_segment.rb +62 -0
bindings/ruby/test/test_whisper.rb +45 -1
bindings/ruby/whispercpp.gemspec +1 -1

bindings/ruby/README.md CHANGED Viewed

@@ -162,6 +162,32 @@ Whisper::Params.new(
 For details on VAD, see [whisper.cpp's README](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#voice-activity-detection-vad).
 API
 ---
@@ -196,7 +222,7 @@ whisper
       ed: format_time(segment.end_time),
       text: segment.text
     }
-    line << " (speaker turned)" if segment.speaker_next_turn?
     puts line
   end
@@ -212,7 +238,7 @@ params.on_new_segment do |segment|
     ed: format_time(segment.end_time),
     text: segment.text
   }
-  line << " (speaker turned)" if segment.speaker_next_turn?
   puts line
 end

 For details on VAD, see [whisper.cpp's README](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#voice-activity-detection-vad).
+### Output ###
+whispercpp supports SRT and WebVTT output:
+```ruby
+puts whisper.transcribe("path/to/audio.wav", Whisper::Params.new).to_webvtt
+# =>
+WEBVTT
+1
+00:00:00.000 --> 00:00:03.860
+ My thought I have nobody by a beauty and will as you poured.
+2
+00:00:03.860 --> 00:00:09.840
+ Mr. Rochester is sub in that so-don't find simplest, and devoted about, to let might in
+3
+00:00:09.840 --> 00:00:09.940
+ a
+```
+You may call `#to_srt`, too.
 API
 ---
       ed: format_time(segment.end_time),
       text: segment.text
     }
+    line << " (speaker turned)" if segment.speaker_turn_next?
     puts line
   end
     ed: format_time(segment.end_time),
     text: segment.text
   }
+  line << " (speaker turned)" if segment.speaker_turn_next?
   puts line
 end

bindings/ruby/ext/ruby_whisper.c CHANGED Viewed

@@ -170,5 +170,7 @@ void Init_whisper() {
   init_ruby_whisper_model(&mWhisper);
   init_ruby_whisper_vad_params(&mVAD);
   rb_require("whisper/model/uri");
 }

   init_ruby_whisper_model(&mWhisper);
   init_ruby_whisper_vad_params(&mVAD);
+  rb_require("whisper/context");
+  rb_require("whisper/segment");
   rb_require("whisper/model/uri");
 }

bindings/ruby/ext/ruby_whisper_context.c CHANGED Viewed

@@ -664,7 +664,7 @@ init_ruby_whisper_context(VALUE *mWhisper)
   rb_define_method(cContext, "full", ruby_whisper_full, -1);
   rb_define_method(cContext, "full_parallel", ruby_whisper_full_parallel, -1);
-  // High leve
   rb_define_method(cContext, "full_get_segment", ruby_whisper_full_get_segment, 1);
   rb_define_method(cContext, "each_segment", ruby_whisper_each_segment, 0);

   rb_define_method(cContext, "full", ruby_whisper_full, -1);
   rb_define_method(cContext, "full_parallel", ruby_whisper_full_parallel, -1);
+  // High level
   rb_define_method(cContext, "full_get_segment", ruby_whisper_full_get_segment, 1);
   rb_define_method(cContext, "each_segment", ruby_whisper_each_segment, 0);

bindings/ruby/ext/ruby_whisper_segment.c CHANGED Viewed

@@ -1,6 +1,15 @@
 #include <ruby.h>
 #include "ruby_whisper.h"
 extern const rb_data_type_t ruby_whisper_type;
 extern VALUE cSegment;
@@ -129,15 +138,83 @@ ruby_whisper_segment_get_no_speech_prob(VALUE self)
   return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index));
 }
 void
 init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cContext)
 {
   cSegment  = rb_define_class_under(*mWhisper, "Segment", rb_cObject);
   rb_define_alloc_func(cSegment, ruby_whisper_segment_allocate);
   rb_define_method(cSegment, "start_time", ruby_whisper_segment_get_start_time, 0);
   rb_define_method(cSegment, "end_time", ruby_whisper_segment_get_end_time, 0);
-  rb_define_method(cSegment, "speaker_next_turn?", ruby_whisper_segment_get_speaker_turn_next, 0);
   rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
   rb_define_method(cSegment, "no_speech_prob", ruby_whisper_segment_get_no_speech_prob, 0);
 }

 #include <ruby.h>
 #include "ruby_whisper.h"
+#define N_KEY_NAMES 5
+static VALUE sym_start_time;
+static VALUE sym_end_time;
+static VALUE sym_text;
+static VALUE sym_no_speech_prob;
+static VALUE sym_speaker_turn_next;
+static VALUE key_names;
 extern const rb_data_type_t ruby_whisper_type;
 extern VALUE cSegment;
   return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index));
 }
+/*
+ * call-seq:
+ *   deconstruct_keys(keys) -> hash
+ *
+ *  Returns the hash consumed by Ruby pattern matching (<tt>segment => {...}</tt>).
+ *
+ *  Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
+ *
+ *  +keys+ is either +nil+, meaning "all possible keys", or an Array of Symbols.
+ *  Keys that are not listed above are skipped. When the Array holds more
+ *  entries than there are possible keys, an empty hash is returned.
+ *  Raises TypeError when +keys+ is neither +nil+ nor an Array.
+ *
+ *   whisper.each_segment do |segment|
+ *     segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
+ *
+ *     puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
+ *   end
+ */
+static VALUE
+ruby_whisper_segment_deconstruct_keys(VALUE self, VALUE keys)
+{
+  /* The values themselves are read through the getters below; these two
+   * lookups only verify that self and its context carry the expected types. */
+  ruby_whisper_segment *rws;
+  TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
+  ruby_whisper *rw;
+  TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+  VALUE hash = rb_hash_new();
+  long n_keys;
+  if (NIL_P(keys)) {
+    /* nil requests every key. */
+    keys = key_names;
+    n_keys = N_KEY_NAMES;
+  } else {
+    /* RARRAY_LEN is only defined for Arrays; reject anything else with a
+     * TypeError instead of reading a foreign object as an Array. */
+    Check_Type(keys, T_ARRAY);
+    n_keys = RARRAY_LEN(keys);
+    if (n_keys > N_KEY_NAMES) {
+      return hash;
+    }
+  }
+  for (long i = 0; i < n_keys; i++) {
+    VALUE key = rb_ary_entry(keys, i);
+    /* Symbols are compared by identity; a key matches at most one branch. */
+    if (key == sym_start_time) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_start_time(self));
+    } else if (key == sym_end_time) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_end_time(self));
+    } else if (key == sym_text) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_text(self));
+    } else if (key == sym_no_speech_prob) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_no_speech_prob(self));
+    } else if (key == sym_speaker_turn_next) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_speaker_turn_next(self));
+    }
+  }
+  return hash;
+}
 void
 init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cContext)
 {
   cSegment  = rb_define_class_under(*mWhisper, "Segment", rb_cObject);
+  /* Symbols accepted by Segment#deconstruct_keys, interned once at load time. */
+  sym_start_time = ID2SYM(rb_intern("start_time"));
+  sym_end_time = ID2SYM(rb_intern("end_time"));
+  sym_text = ID2SYM(rb_intern("text"));
+  sym_no_speech_prob = ID2SYM(rb_intern("no_speech_prob"));
+  sym_speaker_turn_next = ID2SYM(rb_intern("speaker_turn_next"));
+  /* key_names lives only in a static C variable, which the GC does not scan.
+   * Register its address so the Array stays alive for the whole process;
+   * otherwise deconstruct_keys(nil) could read a collected object. */
+  rb_global_variable(&key_names);
+  key_names = rb_ary_new3(
+    N_KEY_NAMES,
+    sym_start_time,
+    sym_end_time,
+    sym_text,
+    sym_no_speech_prob,
+    sym_speaker_turn_next
+  );
   rb_define_alloc_func(cSegment, ruby_whisper_segment_allocate);
   rb_define_method(cSegment, "start_time", ruby_whisper_segment_get_start_time, 0);
   rb_define_method(cSegment, "end_time", ruby_whisper_segment_get_end_time, 0);
+  rb_define_method(cSegment, "speaker_turn_next?", ruby_whisper_segment_get_speaker_turn_next, 0);
   rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
   rb_define_method(cSegment, "no_speech_prob", ruby_whisper_segment_get_no_speech_prob, 0);
+  rb_define_method(cSegment, "deconstruct_keys", ruby_whisper_segment_deconstruct_keys, 1);
 }

bindings/ruby/ext/ruby_whisper_transcribe.cpp CHANGED Viewed

@@ -76,15 +76,16 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
     fprintf(stderr, "failed to process audio\n");
     return self;
   }
   const int n_segments = whisper_full_n_segments(rw->context);
   VALUE output = rb_str_new2("");
   for (int i = 0; i < n_segments; ++i) {
     const char * text = whisper_full_get_segment_text(rw->context, i);
     output = rb_str_concat(output, rb_str_new2(text));
   }
-  if (blk != Qnil) {
-    rb_funcall(blk, id_call, 1, output);
-  }
   return self;
 }
 #ifdef __cplusplus

     fprintf(stderr, "failed to process audio\n");
     return self;
   }
+  if (NIL_P(blk)) {
+    return self;
+  }
   const int n_segments = whisper_full_n_segments(rw->context);
   VALUE output = rb_str_new2("");
   for (int i = 0; i < n_segments; ++i) {
     const char * text = whisper_full_get_segment_text(rw->context, i);
     output = rb_str_concat(output, rb_str_new2(text));
   }
+  rb_funcall(blk, id_call, 1, output);
   return self;
 }
 #ifdef __cplusplus

bindings/ruby/lib/whisper/context.rb ADDED Viewed

	@@ -0,0 +1,15 @@

+module Whisper
+  class Context
+    def to_srt
+      each_segment.with_index.reduce("") {|srt, (segment, index)|
+        srt << "#{index + 1}\n#{segment.to_srt_cue}\n"
+      }
+    end
+    def to_webvtt
+      each_segment.with_index.reduce("WEBVTT\n\n") {|webvtt, (segment, index)|
+        webvtt << "#{index + 1}\n#{segment.to_webvtt_cue}\n"
+      }
+    end
+  end
+end

bindings/ruby/lib/whisper/segment.rb ADDED Viewed

	@@ -0,0 +1,58 @@

+module Whisper
+  class Segment
+    SRT_ESCAPES = {
+      "&" => "&amp;",
+      "<" => "&lt;",
+      ">" => "&gt;",
+    }
+    SRT_ESCAPES_RE = Regexp.union(SRT_ESCAPES.keys)
+    private_constant :SRT_ESCAPES, :SRT_ESCAPES_RE
+    def to_srt_cue
+      "#{srt_start_time} --> #{srt_end_time}\n#{srt_text}\n"
+    end
+    def to_webvtt_cue
+      "#{webvtt_start_time} --> #{webvtt_end_time}\n#{webvtt_text}\n"
+    end
+    private
+    def time_to_a(time)
+      sec, decimal_part = time.divmod(1000)
+      min, sec = sec.divmod(60)
+      hour, min = min.divmod(60)
+      [hour, min, sec, decimal_part]
+    end
+    def srt_time(time)
+      "%02d:%02d:%02d,%03d" % time_to_a(time)
+    end
+    def srt_start_time
+      srt_time(start_time)
+    end
+    def srt_end_time
+      srt_time(end_time)
+    end
+    def srt_text
+      text.gsub(SRT_ESCAPES_RE, SRT_ESCAPES)
+    end
+    def webvtt_time(time)
+      "%02d:%02d:%02d.%03d" % time_to_a(time)
+    end
+    def webvtt_start_time
+      webvtt_time(start_time)
+    end
+    def webvtt_end_time
+      webvtt_time(end_time)
+    end
+    alias webvtt_text srt_text
+  end
+end

bindings/ruby/sig/whisper.rbs CHANGED Viewed

@@ -116,6 +116,9 @@ module Whisper
     def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
                      | (Params, _Samples, ?Integer n_samples) -> self
                      | (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
   end
   class Params
@@ -415,6 +418,14 @@ module Whisper
   end
   class Segment
     # Start time in milliseconds.
     #
     def start_time: () -> Integer
@@ -424,10 +435,21 @@ module Whisper
     def end_time: () -> Integer
     # Whether the next segment is predicted as a speaker turn.
-    def speaker_next_turn?: () -> (true | false)
     def text: () -> String
     def no_speech_prob: () -> Float
   end
   module VAD

     def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
                      | (Params, _Samples, ?Integer n_samples) -> self
                      | (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
+    def to_srt: () -> String
+    def to_webvtt: () -> String
   end
   class Params
   end
   class Segment
+    type deconstructed_keys = {
+      start_time: (Integer | nil),
+      end_time: (Integer | nil),
+      text: (String | nil),
+      no_speech_prob: (Float | nil),
+      speaker_turn_next: (true | false | nil)
+    }
     # Start time in milliseconds.
     #
     def start_time: () -> Integer
     def end_time: () -> Integer
     # Whether the next segment is predicted as a speaker turn.
+    def speaker_turn_next?: () -> (true | false)
     def text: () -> String
     def no_speech_prob: () -> Float
+    def to_srt_cue: () -> String
+    def to_webvtt_cue: () -> String
+    #  Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
+    #
+    #      whisper.each_segment do |segment|
+    #        segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
+    #
+    #        puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
+    #      end
+    def deconstruct_keys: (Array[:start_time | :end_time | :text | :no_speech_prob | :speaker_turn_next] | nil) -> deconstructed_keys
   end
   module VAD

bindings/ruby/test/test_segment.rb CHANGED Viewed

@@ -71,4 +71,66 @@ class TestSegment < TestBase
     end
     whisper.transcribe(AUDIO, params)
   end
 end

     end
     whisper.transcribe(AUDIO, params)
   end
+  def test_pattern_matching
+    segment = whisper.each_segment.first
+    segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
+    assert_equal segment.start_time, start_time
+    assert_equal segment.end_time, end_time
+    assert_equal segment.text, text
+    assert_equal segment.no_speech_prob, no_speech_prob
+    assert_equal segment.speaker_turn_next?, speaker_turn_next
+  end
+  def test_pattern_matching_partial
+    segment = whisper.each_segment.first
+    segment => {start_time:, end_time:, text:}
+    assert_equal segment.start_time, start_time
+    assert_equal segment.end_time, end_time
+    assert_equal segment.text, text
+  end
+  def test_deconstruct_keys
+    segment = whisper.each_segment.first
+    expected = {
+      start_time: segment.start_time,
+      end_time: segment.end_time,
+      text: segment.text,
+      no_speech_prob: segment.no_speech_prob,
+      speaker_turn_next: segment.speaker_turn_next?
+    }
+    assert_equal expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next])
+  end
+  def test_deconstruct_keys_non_existent
+    omit "Undefined behavior"
+    segment = whisper.each_segment.first
+    assert_equal({}, segment.deconstruct_keys([:non_existent]))
+  end
+  def test_deconstruct_keys_too_many_keys
+    omit "Undefined behavior"
+    segment = whisper.each_segment.first
+    assert_equal({}, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next, :extra_key]))
+  end
+  def test_deconstruct_keys_includes_non_existent_keys_not_too_many
+    omit "Undefined behavior"
+    segment = whisper.each_segment.first
+    expected = {
+      start_time: segment.start_time,
+      end_time: segment.end_time,
+      text: segment.text,
+      no_speech_prob: segment.no_speech_prob
+    }
+    assert_equal(expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :non_existent]))
+  end
 end

bindings/ruby/test/test_whisper.rb CHANGED Viewed

@@ -113,7 +113,7 @@ class TestWhisper < TestBase
   end
   def test_system_info_str
-    assert_match /\AWHISPER : COREML = \d | OPENVINO = \d |/, Whisper.system_info_str
   end
   def test_log_set
@@ -245,4 +245,48 @@ class TestWhisper < TestBase
       assert_match(/for your country/i, text)
     end
   end
 end

   end
   def test_system_info_str
+    # The "|" characters are meant to match literal separators, so they are
+    # escaped. Unescaped, they form an alternation whose empty last branch
+    # matches every string, which made this assertion always pass.
+    assert_match(/\AWHISPER : COREML = \d \| OPENVINO = \d \|/, Whisper.system_info_str)
   end
   def test_log_set
       assert_match(/for your country/i, text)
     end
   end
+  def test_to_srt
+    whisper = Whisper::Context.new("base.en")
+    whisper.transcribe AUDIO, @params
+    lines = whisper.to_srt.lines
+    assert_match(/\A\d+\n/, lines[0])
+    assert_match(/\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n/, lines[1])
+    assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[2])
+  end
+  def test_to_webvtt
+    whisper = Whisper::Context.new("base.en")
+    whisper.transcribe AUDIO, @params
+    lines = whisper.to_webvtt.lines
+    assert_equal "WEBVTT\n", lines[0]
+    assert_equal "\n", lines[1]
+    assert_match(/\A\d+\n/, lines[2])
+    assert_match(/\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}\n/, lines[3])
+    assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[4])
+  end
+  sub_test_case "Format needs escape" do
+    def setup
+      @whisper = Whisper::Context.new("base.en")
+      @whisper.transcribe AUDIO, Whisper::Params.new
+      segment = @whisper.each_segment.first
+      segment.define_singleton_method :text do
+        "& so my fellow Americans --> ask not what your country can do for you <-- ask what you can do for your country."
+      end
+      @whisper.define_singleton_method :each_segment do
+        Enumerator.new(3) {|yielder| 3.times {yielder << segment}}
+      end
+    end
+    def test_to_srt_escape
+      assert_equal "&amp; so my fellow Americans --&gt; ask not what your country can do for you &lt;-- ask what you can do for your country.\n", @whisper.to_srt.lines[2]
+    end
+    def test_to_webvtt_escape
+      assert_equal "&amp; so my fellow Americans --&gt; ask not what your country can do for you &lt;-- ask what you can do for your country.\n", @whisper.to_webvtt.lines[4]
+    end
+  end
 end

bindings/ruby/whispercpp.gemspec CHANGED Viewed

@@ -4,7 +4,7 @@ Gem::Specification.new do |s|
   s.name    = "whispercpp"
   s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
   s.version = '1.3.3'
-  s.date    = '2025-06-03'
   s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
   s.email   = '[email protected]'
   s.extra_rdoc_files = ['LICENSE', 'README.md']

   s.name    = "whispercpp"
   s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
   s.version = '1.3.3'
+  s.date    = '2025-06-10'
   s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
   s.email   = '[email protected]'
   s.extra_rdoc_files = ['LICENSE', 'README.md']