danbev committed · Commit 58d6e4e · unverified · 1 Parent(s): c5f7b7e

server : add Voice Activity Detection (VAD) support (#3246)


* server : add Voice Activity Detection (VAD) support

This commit adds support for Voice Activity Detection (VAD) in the
server example.

The motivation for this is to enable VAD processing when using
whisper-server.

Resolves: https://github.com/ggml-org/whisper.cpp/issues/3089

* server : add VAD parameters to usage in README.md [no ci]

This commit also adds a few missing parameters.

* server : fix conflicting short options [no ci]

examples/server/README.md CHANGED
@@ -23,6 +23,7 @@ options:
   -sow,      --split-on-word     [false  ] split on word rather than on token
   -bo N,     --best-of N         [2      ] number of best candidates to keep
   -bs N,     --beam-size N       [-1     ] beam size for beam search
+  -ac N,     --audio-ctx N       [0      ] audio context size (0 - all)
   -wt N,     --word-thold N      [0.01   ] word timestamp probability threshold
   -et N,     --entropy-thold N   [2.40   ] entropy threshold for decoder fail
   -lpt N,    --logprob-thold N   [-1.00  ] log probability threshold for decoder fail
 
@@ -41,9 +42,28 @@ options:
   --prompt PROMPT                [       ] initial prompt
   -m FNAME,  --model FNAME       [models/ggml-base.en.bin] model path
   -oved D,   --ov-e-device DNAME [CPU    ] the OpenVINO device used for encode inference
+  -dtw MODEL --dtw MODEL         [       ] compute token-level timestamps
   --host HOST,                   [127.0.0.1] Hostname/ip-address for the server
   --port PORT,                   [8080   ] Port number for the server
+  --public PATH,                 [examples/server/public] Path to the public folder
+  --request-path PATH,           [       ] Request path for all requests
+  --inference-path PATH,         [/inference] Inference path for all requests
   --convert,                     [false  ] Convert audio to WAV, requires ffmpeg on the server
+  -sns,      --suppress-nst      [false  ] suppress non-speech tokens
+  -nth N,    --no-speech-thold N [0.60   ] no speech threshold
+  -nc,       --no-context        [false  ] do not use previous audio context
+  -ng,       --no-gpu            [false  ] do not use gpu
+  -fa,       --flash-attn        [false  ] flash attention
+
+Voice Activity Detection (VAD) options:
+             --vad                           [false  ] enable Voice Activity Detection (VAD)
+  -vm FNAME, --vad-model FNAME               [       ] VAD model path
+  -vt N,     --vad-threshold N               [0.50   ] VAD threshold for speech recognition
+  -vspd N,   --vad-min-speech-duration-ms N  [250    ] VAD min speech duration (in ms)
+  -vsd N,    --vad-min-silence-duration-ms N [100    ] VAD min silence duration (to split segments)
+  -vmsd N,   --vad-max-speech-duration-s N   [FLT_MAX] VAD max speech duration (auto-split longer)
+  -vp N,     --vad-speech-pad-ms N           [30     ] VAD speech padding (extend segments)
+  -vo N,     --vad-samples-overlap N         [0.10   ] VAD samples overlap (seconds between segments)
 ```
 
 > [!WARNING]
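
With these options, VAD can be enabled at server startup, e.g. `whisper-server -m models/ggml-base.en.bin --vad -vm models/silero-vad-ggml.bin -vt 0.50` (the VAD model file name here is illustrative; point `--vad-model` at whatever GGML-format VAD model you use). The remaining `-v*` flags tune how detected speech is segmented and default to the values shown above.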
examples/server/server.cpp CHANGED
 
@@ -5,6 +5,7 @@
 #include "httplib.h"
 #include "json.hpp"
 
+#include <cfloat>
 #include <chrono>
 #include <cmath>
 #include <cstdio>
 
@@ -90,6 +91,16 @@ struct whisper_params {
     std::string openvino_encode_device = "CPU";
 
     std::string dtw = "";
+
+    // Voice Activity Detection (VAD) parameters
+    bool        vad                         = false;
+    std::string vad_model                   = "";
+    float       vad_threshold               = 0.5f;
+    int         vad_min_speech_duration_ms  = 250;
+    int         vad_min_silence_duration_ms = 100;
+    float       vad_max_speech_duration_s   = FLT_MAX;
+    int         vad_speech_pad_ms           = 30;
+    float       vad_samples_overlap         = 0.1f;
 };
 
 void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) {
 
@@ -140,6 +151,18 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -nc,       --no-context                    [%-7s] do not use previous audio context\n", params.no_context ? "true" : "false");
     fprintf(stderr, "  -ng,       --no-gpu                        [%-7s] do not use gpu\n", params.use_gpu ? "false" : "true");
     fprintf(stderr, "  -fa,       --flash-attn                    [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
+    // Voice Activity Detection (VAD) parameters
+    fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
+    fprintf(stderr, "             --vad                           [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
+    fprintf(stderr, "  -vm FNAME, --vad-model FNAME               [%-7s] VAD model path\n", params.vad_model.c_str());
+    fprintf(stderr, "  -vt N,     --vad-threshold N               [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold);
+    fprintf(stderr, "  -vspd N,   --vad-min-speech-duration-ms N  [%-7d] VAD min speech duration (in ms)\n", params.vad_min_speech_duration_ms);
+    fprintf(stderr, "  -vsd N,    --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms);
+    fprintf(stderr, "  -vmsd N,   --vad-max-speech-duration-s N   [%-7s] VAD max speech duration (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ?
+                                                                           std::string("FLT_MAX").c_str() :
+                                                                           std::to_string(params.vad_max_speech_duration_s).c_str());
+    fprintf(stderr, "  -vp N,     --vad-speech-pad-ms N           [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms);
+    fprintf(stderr, "  -vo N,     --vad-samples-overlap N         [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
     fprintf(stderr, "\n");
 }
 
@@ -195,6 +218,16 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
     else if (                  arg == "--request-path")                 { sparams.request_path   = argv[++i]; }
     else if (                  arg == "--inference-path")               { sparams.inference_path = argv[++i]; }
     else if (                  arg == "--convert")                      { sparams.ffmpeg_converter = true; }
+
+    // Voice Activity Detection (VAD)
+    else if (                  arg == "--vad")                          { params.vad = true; }
+    else if (arg == "-vm"   || arg == "--vad-model")                    { params.vad_model = argv[++i]; }
+    else if (arg == "-vt"   || arg == "--vad-threshold")                { params.vad_threshold = std::stof(argv[++i]); }
+    else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms")   { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
+    else if (arg == "-vsd"  || arg == "--vad-min-silence-duration-ms")  { params.vad_min_silence_duration_ms = std::stoi(argv[++i]); }
+    else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s")    { params.vad_max_speech_duration_s = std::stof(argv[++i]); }
+    else if (arg == "-vp"   || arg == "--vad-speech-pad-ms")            { params.vad_speech_pad_ms = std::stoi(argv[++i]); }
+    else if (arg == "-vo"   || arg == "--vad-samples-overlap")          { params.vad_samples_overlap = std::stof(argv[++i]); }
     else {
         fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
         whisper_print_usage(argc, argv, params, sparams);
 
@@ -511,6 +544,34 @@ void get_req_parameters(const Request & req, whisper_params & params)
     {
         params.no_context = parse_str_to_bool(req.get_file_value("no_context").content);
     }
+    if (req.has_file("vad"))
+    {
+        params.vad = parse_str_to_bool(req.get_file_value("vad").content);
+    }
+    if (req.has_file("vad_threshold"))
+    {
+        params.vad_threshold = std::stof(req.get_file_value("vad_threshold").content);
+    }
+    if (req.has_file("vad_min_speech_duration_ms"))
+    {
+        params.vad_min_speech_duration_ms = std::stoi(req.get_file_value("vad_min_speech_duration_ms").content);
+    }
+    if (req.has_file("vad_min_silence_duration_ms"))
+    {
+        params.vad_min_silence_duration_ms = std::stoi(req.get_file_value("vad_min_silence_duration_ms").content);
+    }
+    if (req.has_file("vad_max_speech_duration_s"))
+    {
+        params.vad_max_speech_duration_s = std::stof(req.get_file_value("vad_max_speech_duration_s").content);
+    }
+    if (req.has_file("vad_speech_pad_ms"))
+    {
+        params.vad_speech_pad_ms = std::stoi(req.get_file_value("vad_speech_pad_ms").content);
+    }
+    if (req.has_file("vad_samples_overlap"))
+    {
+        params.vad_samples_overlap = std::stof(req.get_file_value("vad_samples_overlap").content);
+    }
 }
 
 } // namespace
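
Since `get_req_parameters` reads these settings from multipart form fields, a client can toggle VAD per request; the VAD model itself must still be loaded at startup via `--vad-model`, as there is no `vad_model` request field. Below is a minimal client sketch using the same cpp-httplib the server already includes. It assumes the defaults documented above (127.0.0.1:8080, `/inference` path), and the audio file name is illustrative:

```cpp
// Client-side sketch: POST audio to whisper-server with per-request VAD
// settings. Host, port, and inference path are the server defaults.
#include "httplib.h"

#include <cstdio>
#include <fstream>
#include <sstream>

int main() {
    std::ifstream wav("jfk.wav", std::ios::binary); // illustrative file name
    std::stringstream audio;
    audio << wav.rdbuf();

    // field names match those read by get_req_parameters above
    httplib::MultipartFormDataItems items = {
        // { name, content, filename, content_type }
        { "file",                        audio.str(), "jfk.wav", "audio/wav" },
        { "vad",                         "true",      "",        ""          },
        { "vad_threshold",               "0.5",       "",        ""          },
        { "vad_min_silence_duration_ms", "100",       "",        ""          },
    };

    httplib::Client cli("127.0.0.1", 8080);
    if (auto res = cli.Post("/inference", items)) {
        printf("%s\n", res->body.c_str());
    }
    return 0;
}
```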
 
@@ -829,6 +890,16 @@ int main(int argc, char ** argv) {
 
     wparams.suppress_nst     = params.suppress_nst;
 
+    wparams.vad            = params.vad;
+    wparams.vad_model_path = params.vad_model.c_str();
+
+    wparams.vad_params.threshold               = params.vad_threshold;
+    wparams.vad_params.min_speech_duration_ms  = params.vad_min_speech_duration_ms;
+    wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
+    wparams.vad_params.max_speech_duration_s   = params.vad_max_speech_duration_s;
+    wparams.vad_params.speech_pad_ms           = params.vad_speech_pad_ms;
+    wparams.vad_params.samples_overlap         = params.vad_samples_overlap;
+
     whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
 
     // this callback is called on each new segment
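
The fields set above are the VAD members that `whisper_full_params` itself exposes, so the same wiring works outside the server. A minimal standalone sketch under the same assumptions (model paths are placeholders, and `pcmf32` is assumed to hold 16 kHz mono samples filled elsewhere):

```cpp
// Standalone sketch of the same VAD wiring against the whisper.cpp API.
// Model paths below are placeholders, not files shipped with the repo.
#include "whisper.h"

#include <cstdio>
#include <vector>

int main() {
    struct whisper_context * ctx = whisper_init_from_file_with_params(
        "ggml-base.en.bin", whisper_context_default_params());

    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    // same fields the server copies out of its whisper_params
    wparams.vad            = true;
    wparams.vad_model_path = "silero-vad-ggml.bin"; // placeholder path
    wparams.vad_params.threshold               = 0.5f;
    wparams.vad_params.min_silence_duration_ms = 100;

    std::vector<float> pcmf32; // 16 kHz mono PCM, filled elsewhere
    if (whisper_full(ctx, wparams, pcmf32.data(), (int) pcmf32.size()) == 0) {
        for (int i = 0; i < whisper_full_n_segments(ctx); ++i) {
            printf("%s\n", whisper_full_get_segment_text(ctx, i));
        }
    }

    whisper_free(ctx);
    return 0;
}
```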