Spaces:

natasa365
/

whisper.cpp

Running

App Files Files Community

danbev committed on May 16

Commit

821d05f

unverified ·

1 Parent(s): 1374002

vad : store VAD context in whisper_state (#3156)

Browse files

* vad : store VAD context in whisper_state

This commit stores the VAD context in the whisper_state structure,
allowing for better management and reuse of the VAD context across
multiple calls to the whisper_vad function.

The motivation for this change is that when updating the stream example,
I noticed that the VAD context was being re-initialized every time the
whisper_vad function was called. This involved loading the VAD model,
which is expensive and unnecessary if the context can be reused.

Storing this in the whisper_state seems to follow a pattern similar to
how whisper_coreml_context and whisper_openvino_context are stored.

* vad : free vad_context in whisper_free_state

Files changed (1) hide show

src/whisper.cpp +16 -5

src/whisper.cpp CHANGED Viewed

@@ -954,6 +954,8 @@ struct whisper_state {
     // [EXPERIMENTAL] speed-up techniques
     int32_t exp_n_audio_ctx = 0; // 0 - use default
     struct vad_segment_info {
         float orig_start;
         float orig_end;
@@ -3853,6 +3855,11 @@ void whisper_free_state(struct whisper_state * state) {
         // [EXPERIMENTAL] Token-level timestamps with DTW
         aheads_masks_free(state->aheads_masks);
         delete state;
     }
 }
@@ -6613,12 +6620,16 @@ static bool whisper_vad(
     WHISPER_LOG_INFO("%s: VAD is enabled, processing speach segments only\n", __func__);
     filtered_n_samples = 0;
-    struct whisper_vad_context_params vad_ctx_params = whisper_vad_default_context_params();
-    struct whisper_vad_context * vctx = whisper_vad_init_from_file_with_params(params.vad_model_path, vad_ctx_params);
-    if (vctx == nullptr) {
-        WHISPER_LOG_ERROR("%s: failed to initialize VAD context\n", __func__);
-        return false;
     }
     const whisper_vad_params & vad_params = params.vad_params;

     // [EXPERIMENTAL] speed-up techniques
     int32_t exp_n_audio_ctx = 0; // 0 - use default
+    whisper_vad_context * vad_context = nullptr;
     struct vad_segment_info {
         float orig_start;
         float orig_end;
         // [EXPERIMENTAL] Token-level timestamps with DTW
         aheads_masks_free(state->aheads_masks);
+        if (state->vad_context != nullptr) {
+            whisper_vad_free(state->vad_context);
+            state->vad_context = nullptr;
+        }
         delete state;
     }
 }
     WHISPER_LOG_INFO("%s: VAD is enabled, processing speach segments only\n", __func__);
     filtered_n_samples = 0;
+    if (state->vad_context == nullptr) {
+        struct whisper_vad_context_params vad_ctx_params = whisper_vad_default_context_params();
+        struct whisper_vad_context * vctx = whisper_vad_init_from_file_with_params(params.vad_model_path, vad_ctx_params);
+        if (vctx == nullptr) {
+            WHISPER_LOG_ERROR("%s: failed to initialize VAD context\n", __func__);
+            return false;
+        }
+        state->vad_context = vctx;
     }
+    auto vctx = state->vad_context;
     const whisper_vad_params & vad_params = params.vad_params;