Spaces:

natasa365
/

whisper.cpp

Sleeping

App Files Files Community

whisper.cpp / bindings /ruby /sig /whisper.rbs

KitaitiMakoto

ruby : add encoder begin callback related methods (#3076)

855927b unverified 8 months ago

raw

history blame

13 kB

	module Whisper
	# Structural interface for audio sample containers: any object that reports
	# its +length+ and yields Float values from +each+. Accepted as the samples
	# argument in the second overloads of Context#full and Context#full_parallel.
	interface _Samples
	def length: () -> Integer
	def each: { (Float) -> void } -> void
	end

	# Proc types used by the callback-related signatures in this file.
	# log_callback is the type taken by Whisper.log_set; the others are the types
	# taken by the same-named Params setters. The callbacks that receive a
	# Whisper::Context also declare a second positional argument typed +void+.
	type log_callback = ^(Integer level, String message, Object user_data) -> void
	type new_segment_callback = ^(Whisper::Context, void, Integer n_new, Object user_data) -> void
	type progress_callback = ^(Whisper::Context, void, Integer progress, Object user_data) -> void
	type encoder_begin_callback = ^(Whisper::Context, void, Object user_data) -> void
	type abort_callback = ^(Whisper::Context, void, Object user_data) -> boolish

	# Integer log level constants.
	# NOTE(review): presumably the values passed as +level+ to a log_callback — confirm against the native extension.
	LOG_LEVEL_NONE: Integer
	LOG_LEVEL_INFO: Integer
	LOG_LEVEL_WARN: Integer
	LOG_LEVEL_ERROR: Integer
	LOG_LEVEL_DEBUG: Integer
	LOG_LEVEL_CONT: Integer

	# Module-level functions.
	#
	# lang_id maps a language name to an Integer ID; lang_str and lang_str_full
	# map an Integer ID back to a String (see Context#full_lang_id).
	# log_set takes a log_callback plus optional user data and returns a log_callback.
	def self.lang_max_id: () -> Integer
	def self.lang_id: (string name) -> Integer
	def self.lang_str: (Integer id) -> String
	def self.lang_str_full: (Integer id) -> String
	def self.log_set: (log_callback, Object? user_data) -> log_callback

	# Transcription context: runs transcription and exposes the resulting segments.
	#
	class Context
	# Takes a path-like value or a ::URI::HTTP.
	# NOTE(review): presumably the location of the model to load — confirm.
	#
	def self.new: (path | ::URI::HTTP) -> instance

	# Transcribes a single file.
	# Can yield results to a block.
	#
	# params = Whisper::Params.new
	# params.duration = 60_000
	# whisper.transcribe "path/to/audio.wav", params do |text|
	# puts text
	# end
	#
	def transcribe: (string, Params) -> self
	| (string, Params) { (String) -> void } -> self

	# Model metadata readers. Their names parallel the readers declared on Model
	# (presumably the same values — confirm).
	#
	def model_n_vocab: () -> Integer
	def model_n_audio_ctx: () -> Integer
	def model_n_audio_state: () -> Integer
	def model_n_text_head: () -> Integer
	def model_n_text_layer: () -> Integer
	def model_n_mels: () -> Integer
	def model_ftype: () -> Integer
	def model_type: () -> String

	# Yields each Whisper::Segment:
	#
	# whisper.transcribe("path/to/audio.wav", params)
	# whisper.each_segment do |segment|
	# puts segment.text
	# end
	#
	# Returns an Enumerator if no block given:
	#
	# whisper.transcribe("path/to/audio.wav", params)
	# enum = whisper.each_segment
	# enum.to_a # => [#<Whisper::Segment>, ...]
	#
	def each_segment: { (Segment) -> void } -> void
	| () -> Enumerator[Segment]

	def model: () -> Model
	def full_get_segment: (Integer nth) -> Segment
	def full_n_segments: () -> Integer

	# Language ID, which can be converted to string by Whisper.lang_str and Whisper.lang_str_full.
	#
	def full_lang_id: () -> Integer

	# Start time of a segment indexed by +segment_index+ in centiseconds (units of 10 milliseconds).
	#
	# full_get_segment_t0(3) # => 1668 (16680 ms)
	#
	def full_get_segment_t0: (Integer) -> Integer

	# End time of a segment indexed by +segment_index+ in centiseconds (units of 10 milliseconds).
	#
	# full_get_segment_t1(3) # => 1668 (16680 ms)
	#
	def full_get_segment_t1: (Integer) -> Integer

	# Whether the next segment indexed by +segment_index+ is predicted as a speaker turn.
	#
	# full_get_segment_speaker_turn_next(3) # => true
	#
	def full_get_segment_speaker_turn_next: (Integer) -> (true | false)

	# Text of a segment indexed by +segment_index+.
	#
	# full_get_segment_text(3) # => "ask not what your country can do for you, ..."
	#
	def full_get_segment_text: (Integer) -> String

	# No-speech probability of a segment indexed by +segment_index+.
	#
	def full_get_segment_no_speech_prob: (Integer) -> Float

	# Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
	# Not thread safe for same context
	# Uses the specified decoding strategy to obtain the text.
	#
	# The second argument +samples+ must be an array of samples, respond to :length, or be a MemoryView of an array of float. It must be 32 bit float PCM audio data.
	#
	def full: (Params, Array[Float] samples, ?Integer n_samples) -> self
	| (Params, _Samples, ?Integer n_samples) -> self

	# Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
	# Result is stored in the default state of the context
	# Not thread safe if executed in parallel on the same context.
	# It seems this approach can offer some speedup in some cases.
	# However, the transcription accuracy can be worse at the beginning and end of each chunk.
	#
	def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
	| (Params, _Samples, ?Integer n_samples) -> self
	| (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
	end

	# Transcription parameters. Every option is available both as an optional
	# keyword of Params.new and as a reader/writer pair; callbacks can also be
	# registered through the block-taking hooks at the end of the class.
	#
	class Params
	def self.new: (
	?language: string,
	?translate: boolish,
	?no_context: boolish,
	?single_segment: boolish,
	?print_special: boolish,
	?print_progress: boolish,
	?print_realtime: boolish,
	?print_timestamps: boolish,
	?suppress_blank: boolish,
	?suppress_nst: boolish,
	?token_timestamps: boolish,
	?split_on_word: boolish,
	?initial_prompt: string | nil,
	?diarize: boolish,
	?offset: Integer,
	?duration: Integer,
	?max_text_tokens: Integer,
	?temperature: Float,
	?max_initial_ts: Float,
	?length_penalty: Float,
	?temperature_inc: Float,
	?entropy_thold: Float,
	?logprob_thold: Float,
	?no_speech_thold: Float,
	?new_segment_callback: new_segment_callback,
	?new_segment_callback_user_data: Object,
	?progress_callback: progress_callback,
	?progress_callback_user_data: Object,
	?encoder_begin_callback: encoder_begin_callback,
	?encoder_begin_callback_user_data: Object,
	?abort_callback: abort_callback,
	?abort_callback_user_data: Object
	) -> instance

	# params.language = "auto" | "en", etc...
	#
	def language=: (String) -> String # TODO: Enumerate lang names

	def language: () -> String
	def translate=: (boolish) -> boolish
	def translate: () -> (true | false)
	def no_context=: (boolish) -> boolish

	# If true, does not use past transcription (if any) as initial prompt for the decoder.
	#
	def no_context: () -> (true | false)

	def single_segment=: (boolish) -> boolish

	# If true, forces single segment output (useful for streaming).
	#
	def single_segment: () -> (true | false)

	def print_special=: (boolish) -> boolish

	# If true, prints special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.).
	#
	def print_special: () -> (true | false)

	def print_progress=: (boolish) -> boolish

	# If true, prints progress information.
	#
	def print_progress: () -> (true | false)

	def print_realtime=: (boolish) -> boolish

	# If true, prints results from within whisper.cpp. (avoid it, use callback instead)
	#
	def print_realtime: () -> (true | false)

	def print_timestamps=: (boolish) -> boolish

	# If true, prints timestamps for each text segment when printing realtime.
	#
	def print_timestamps: () -> (true | false)

	def suppress_blank=: (boolish) -> boolish

	# If true, suppresses blank outputs.
	#
	def suppress_blank: () -> (true | false)

	def suppress_nst=: (boolish) -> boolish

	# If true, suppresses non-speech-tokens.
	#
	def suppress_nst: () -> (true | false)

	def token_timestamps=: (boolish) -> boolish

	# If true, enables token-level timestamps.
	#
	def token_timestamps: () -> (true | false)

	def split_on_word=: (boolish) -> boolish

	# If true, split on word rather than on token (when used with max_len).
	#
	def split_on_word: () -> (true | false)

	def initial_prompt=: (_ToS) -> _ToS

	# Tokens to provide to the whisper decoder as initial prompt
	# these are prepended to any existing text context from a previous call
	# use whisper_tokenize() to convert text to tokens.
	# Maximum of whisper_n_text_ctx()/2 tokens are used (typically 224).
	#
	def initial_prompt: () -> (String | nil)

	def diarize=: (boolish) -> boolish

	# If true, enables diarization.
	#
	def diarize: () -> (true | false)

	def offset=: (Integer) -> Integer

	# Start offset in ms.
	#
	def offset: () -> Integer

	def duration=: (Integer) -> Integer

	# Audio duration to process in ms.
	#
	def duration: () -> Integer

	def max_text_tokens=: (Integer) -> Integer

	# Max tokens to use from past text as prompt for the decoder.
	#
	def max_text_tokens: () -> Integer

	def temperature=: (Float) -> Float
	def temperature: () -> Float
	def max_initial_ts=: (Float) -> Float

	# See https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
	#
	def max_initial_ts: () -> Float

	def length_penalty=: (Float) -> Float
	def length_penalty: () -> Float
	def temperature_inc=: (Float) -> Float
	def temperature_inc: () -> Float
	def entropy_thold=: (Float) -> Float

	# Similar to OpenAI's "compression_ratio_threshold"
	#
	def entropy_thold: () -> Float

	def logprob_thold=: (Float) -> Float
	def logprob_thold: () -> Float
	def no_speech_thold=: (Float) -> Float
	def no_speech_thold: () -> Float

	# Sets new segment callback, called for every newly generated text segment.
	#
	# params.new_segment_callback = ->(context, _, n_new, user_data) {
	# # ...
	# }
	#
	def new_segment_callback=: (new_segment_callback) -> new_segment_callback
	def new_segment_callback: () -> (new_segment_callback | nil)

	# Sets user data passed to the last argument of new segment callback.
	#
	def new_segment_callback_user_data=: (Object) -> Object

	def new_segment_callback_user_data: () -> Object

	# Sets progress callback, called on each progress update.
	#
	# params.progress_callback = ->(context, _, progress, user_data) {
	# # ...
	# }
	#
	# +progress+ is an Integer between 0 and 100.
	#
	def progress_callback=: (progress_callback) -> progress_callback

	def progress_callback: () -> (progress_callback | nil)

	# Sets user data passed to the last argument of progress callback.
	#
	def progress_callback_user_data=: (Object) -> Object

	def progress_callback_user_data: () -> Object

	# Sets encoder begin callback, called when the encoder starts.
	#
	def encoder_begin_callback=: (encoder_begin_callback) -> encoder_begin_callback

	def encoder_begin_callback: () -> (encoder_begin_callback | nil)

	# Sets user data passed to the last argument of encoder begin callback.
	#
	def encoder_begin_callback_user_data=: (Object) -> Object

	def encoder_begin_callback_user_data: () -> Object

	# Sets abort callback, called to check if the process should be aborted.
	#
	# params.abort_callback = ->(user_data) {
	# # ...
	# }
	#
	# NOTE(review): this example and #abort_on pass only +user_data+, whereas the
	# abort_callback type declares (Whisper::Context, void, Object user_data) —
	# confirm which arity the native extension actually uses.
	#
	def abort_callback=: (abort_callback) -> abort_callback

	def abort_callback: () -> (abort_callback | nil)

	# Sets user data passed to the last argument of abort callback.
	#
	def abort_callback_user_data=: (Object) -> Object

	def abort_callback_user_data: () -> Object

	# Hook called on new segment. Yields each Whisper::Segment.
	#
	# params.on_new_segment do |segment|
	# # ...
	# end
	#
	def on_new_segment: { (Segment) -> void } -> void

	# Hook called on progress update. Yields each progress Integer between 0 and 100.
	#
	def on_progress: { (Integer progress) -> void } -> void

	# Hook called when the encoder starts.
	#
	def on_encoder_begin: { () -> void } -> void

	# Calls the block to determine whether to abort. Return +true+ when you want to abort.
	#
	# params.abort_on do
	# if some_condition
	# true # abort
	# else
	# false # continue
	# end
	# end
	#
	def abort_on: { (Object user_data) -> boolish } -> void
	end

	# Model metadata readers, plus the nested Model::URI class.
	#
	class Model
	# Hash keyed by String whose values are Model::URI instances.
	#
	def self.pre_converted_models: () -> Hash[String, Model::URI]
	def self.new: () -> instance
	def n_vocab: () -> Integer
	def n_audio_ctx: () -> Integer
	def n_audio_state: () -> Integer
	def n_audio_head: () -> Integer
	def n_audio_layer: () -> Integer
	def n_text_ctx: () -> Integer
	def n_text_state: () -> Integer
	def n_text_head: () -> Integer
	def n_text_layer: () -> Integer
	def n_mels: () -> Integer
	def ftype: () -> Integer
	def type: () -> String

	# Built from a string-like value or a ::URI::HTTP; exposes +to_path+ and +clear_cache+.
	# NOTE(review): presumably a remote model that is downloaded and cached locally — confirm.
	#
	class URI
	def self.new: (string | ::URI::HTTP) -> instance
	def to_path: () -> String
	def clear_cache: () -> void
	end
	end

	# A single segment, as yielded by Context#each_segment and the
	# Params#on_new_segment hook.
	#
	class Segment
	# Start time in milliseconds.
	#
	def start_time: () -> Integer

	# End time in milliseconds.
	#
	def end_time: () -> Integer

	# Whether the next segment is predicted as a speaker turn.
	#
	def speaker_next_turn?: () -> (true | false)

	def text: () -> String
	def no_speech_prob: () -> Float
	end

	# StandardError subclass that is constructed from an Integer +code+ and
	# exposes it through a reader.
	class Error < StandardError
	attr_reader code: Integer

	def self.new: (Integer code) -> instance
	end
	end