Spaces:
Running
Running
vad : store VAD context in whisper_state (#3156)
Browse files
* vad : store VAD context in whisper_state
This commit stores the VAD context in the whisper_state structure,
allowing for better management and reuse of the VAD context across
multiple calls to the whisper_vad function.
The motivation for this change is that when updating the stream example
I noticed that the VAD context was being re-initialized every time the
whisper_vad function was called. This involved loading the VAD model
which is expensive and unnecessary if the context can be reused.
Storing this in the whisper_state seems to follow a pattern similar to
how whisper_coreml_context and whisper_openvino_context are stored.
* vad : free vad_context in whisper_free_state
- src/whisper.cpp +16 -5
src/whisper.cpp
CHANGED
|
@@ -954,6 +954,8 @@ struct whisper_state {
|
|
| 954 |
// [EXPERIMENTAL] speed-up techniques
|
| 955 |
int32_t exp_n_audio_ctx = 0; // 0 - use default
|
| 956 |
|
|
|
|
|
|
|
| 957 |
struct vad_segment_info {
|
| 958 |
float orig_start;
|
| 959 |
float orig_end;
|
|
@@ -3853,6 +3855,11 @@ void whisper_free_state(struct whisper_state * state) {
|
|
| 3853 |
// [EXPERIMENTAL] Token-level timestamps with DTW
|
| 3854 |
aheads_masks_free(state->aheads_masks);
|
| 3855 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3856 |
delete state;
|
| 3857 |
}
|
| 3858 |
}
|
|
@@ -6613,12 +6620,16 @@ static bool whisper_vad(
|
|
| 6613 |
WHISPER_LOG_INFO("%s: VAD is enabled, processing speach segments only\n", __func__);
|
| 6614 |
filtered_n_samples = 0;
|
| 6615 |
|
| 6616 |
-
|
| 6617 |
-
|
| 6618 |
-
|
| 6619 |
-
|
| 6620 |
-
|
|
|
|
|
|
|
|
|
|
| 6621 |
}
|
|
|
|
| 6622 |
|
| 6623 |
const whisper_vad_params & vad_params = params.vad_params;
|
| 6624 |
|
|
|
|
| 954 |
// [EXPERIMENTAL] speed-up techniques
|
| 955 |
int32_t exp_n_audio_ctx = 0; // 0 - use default
|
| 956 |
|
| 957 |
+
whisper_vad_context * vad_context = nullptr;
|
| 958 |
+
|
| 959 |
struct vad_segment_info {
|
| 960 |
float orig_start;
|
| 961 |
float orig_end;
|
|
|
|
| 3855 |
// [EXPERIMENTAL] Token-level timestamps with DTW
|
| 3856 |
aheads_masks_free(state->aheads_masks);
|
| 3857 |
|
| 3858 |
+
if (state->vad_context != nullptr) {
|
| 3859 |
+
whisper_vad_free(state->vad_context);
|
| 3860 |
+
state->vad_context = nullptr;
|
| 3861 |
+
}
|
| 3862 |
+
|
| 3863 |
delete state;
|
| 3864 |
}
|
| 3865 |
}
|
|
|
|
| 6620 |
WHISPER_LOG_INFO("%s: VAD is enabled, processing speach segments only\n", __func__);
|
| 6621 |
filtered_n_samples = 0;
|
| 6622 |
|
| 6623 |
+
if (state->vad_context == nullptr) {
|
| 6624 |
+
struct whisper_vad_context_params vad_ctx_params = whisper_vad_default_context_params();
|
| 6625 |
+
struct whisper_vad_context * vctx = whisper_vad_init_from_file_with_params(params.vad_model_path, vad_ctx_params);
|
| 6626 |
+
if (vctx == nullptr) {
|
| 6627 |
+
WHISPER_LOG_ERROR("%s: failed to initialize VAD context\n", __func__);
|
| 6628 |
+
return false;
|
| 6629 |
+
}
|
| 6630 |
+
state->vad_context = vctx;
|
| 6631 |
}
|
| 6632 |
+
auto vctx = state->vad_context;
|
| 6633 |
|
| 6634 |
const whisper_vad_params & vad_params = params.vad_params;
|
| 6635 |
|