danbev committed on
Commit
821d05f
·
unverified ·
1 Parent(s): 1374002

vad : store VAD context in whisper_state (#3156)

Browse files

* vad : store VAD context in whisper_state

This commit stores the VAD context in the whisper_state structure,
allowing for better management and reuse of the VAD context across
multiple calls to the whisper_vad function.

The motivation for this change is that when updating the stream example
I noticed that the VAD context was being re-initialized every time the
whisper_vad function was called. This involved loading the VAD model
which is expensive and unnecessary if the context can be reused.

Storing this in the whisper_state seems to follow a pattern similar to
how whisper_coreml_context and whisper_openvino_context are stored.

* vad : free vad_context in whisper_free_state

Files changed (1) hide show
  1. src/whisper.cpp +16 -5
src/whisper.cpp CHANGED
@@ -954,6 +954,8 @@ struct whisper_state {
954
  // [EXPERIMENTAL] speed-up techniques
955
  int32_t exp_n_audio_ctx = 0; // 0 - use default
956
 
 
 
957
  struct vad_segment_info {
958
  float orig_start;
959
  float orig_end;
@@ -3853,6 +3855,11 @@ void whisper_free_state(struct whisper_state * state) {
3853
  // [EXPERIMENTAL] Token-level timestamps with DTW
3854
  aheads_masks_free(state->aheads_masks);
3855
 
 
 
 
 
 
3856
  delete state;
3857
  }
3858
  }
@@ -6613,12 +6620,16 @@ static bool whisper_vad(
6613
  WHISPER_LOG_INFO("%s: VAD is enabled, processing speach segments only\n", __func__);
6614
  filtered_n_samples = 0;
6615
 
6616
- struct whisper_vad_context_params vad_ctx_params = whisper_vad_default_context_params();
6617
- struct whisper_vad_context * vctx = whisper_vad_init_from_file_with_params(params.vad_model_path, vad_ctx_params);
6618
- if (vctx == nullptr) {
6619
- WHISPER_LOG_ERROR("%s: failed to initialize VAD context\n", __func__);
6620
- return false;
 
 
 
6621
  }
 
6622
 
6623
  const whisper_vad_params & vad_params = params.vad_params;
6624
 
 
954
  // [EXPERIMENTAL] speed-up techniques
955
  int32_t exp_n_audio_ctx = 0; // 0 - use default
956
 
957
+ whisper_vad_context * vad_context = nullptr;
958
+
959
  struct vad_segment_info {
960
  float orig_start;
961
  float orig_end;
 
3855
  // [EXPERIMENTAL] Token-level timestamps with DTW
3856
  aheads_masks_free(state->aheads_masks);
3857
 
3858
+ if (state->vad_context != nullptr) {
3859
+ whisper_vad_free(state->vad_context);
3860
+ state->vad_context = nullptr;
3861
+ }
3862
+
3863
  delete state;
3864
  }
3865
  }
 
6620
  WHISPER_LOG_INFO("%s: VAD is enabled, processing speach segments only\n", __func__);
6621
  filtered_n_samples = 0;
6622
 
6623
+ if (state->vad_context == nullptr) {
6624
+ struct whisper_vad_context_params vad_ctx_params = whisper_vad_default_context_params();
6625
+ struct whisper_vad_context * vctx = whisper_vad_init_from_file_with_params(params.vad_model_path, vad_ctx_params);
6626
+ if (vctx == nullptr) {
6627
+ WHISPER_LOG_ERROR("%s: failed to initialize VAD context\n", __func__);
6628
+ return false;
6629
+ }
6630
+ state->vad_context = vctx;
6631
  }
6632
+ auto vctx = state->vad_context;
6633
 
6634
  const whisper_vad_params & vad_params = params.vad_params;
6635