danbev committed · Commit 58d6e4e · unverified · 1 Parent(s): c5f7b7e

server : add Voice Activity Detection (VAD) support (#3246)


* server : add Voice Activity Detection (VAD) support

This commit adds support for Voice Activity Detection (VAD) in the
server example.

The motivation for this is to enable VAD processing when using
whisper-server.

Resolves: https://github.com/ggml-org/whisper.cpp/issues/3089

* server : add VAD parameters to usage in README.md [no ci]

This commit also adds a few missing parameters.

* server : fix conflicting short options [no ci]

examples/server/README.md CHANGED
@@ -23,6 +23,7 @@ options:
   -sow,      --split-on-word     [false  ] split on word rather than on token
   -bo N,     --best-of N         [2      ] number of best candidates to keep
   -bs N,     --beam-size N       [-1     ] beam size for beam search
+  -ac N,     --audio-ctx N       [0      ] audio context size (0 - all)
   -wt N,     --word-thold N      [0.01   ] word timestamp probability threshold
   -et N,     --entropy-thold N   [2.40   ] entropy threshold for decoder fail
   -lpt N,    --logprob-thold N   [-1.00  ] log probability threshold for decoder fail
 
@@ -41,9 +42,28 @@ options:
   --prompt PROMPT                [       ] initial prompt
   -m FNAME,  --model FNAME       [models/ggml-base.en.bin] model path
   -oved D,   --ov-e-device DNAME [CPU    ] the OpenVINO device used for encode inference
+  -dtw MODEL --dtw MODEL         [       ] compute token-level timestamps
   --host HOST,                   [127.0.0.1] Hostname/ip-address for the server
   --port PORT,                   [8080   ] Port number for the server
+  --public PATH,                 [examples/server/public] Path to the public folder
+  --request-path PATH,           [       ] Request path for all requests
+  --inference-path PATH,         [/inference] Inference path for all requests
   --convert,                     [false  ] Convert audio to WAV, requires ffmpeg on the server
+  -sns,      --suppress-nst      [false  ] suppress non-speech tokens
+  -nth N,    --no-speech-thold N [0.60   ] no speech threshold
+  -nc,       --no-context        [false  ] do not use previous audio context
+  -ng,       --no-gpu            [false  ] do not use gpu
+  -fa,       --flash-attn        [false  ] flash attention
+
+Voice Activity Detection (VAD) options:
+             --vad                           [false  ] enable Voice Activity Detection (VAD)
+  -vm FNAME, --vad-model FNAME               [       ] VAD model path
+  -vt N,     --vad-threshold N               [0.50   ] VAD threshold for speech recognition
+  -vspd N,   --vad-min-speech-duration-ms N  [250    ] VAD min speech duration (in ms)
+  -vsd N,    --vad-min-silence-duration-ms N [100    ] VAD min silence duration (to split segments)
+  -vmsd N,   --vad-max-speech-duration-s N   [FLT_MAX] VAD max speech duration (auto-split longer)
+  -vp N,     --vad-speech-pad-ms N           [30     ] VAD speech padding (extend segments)
+  -vo N,     --vad-samples-overlap N         [0.10   ] VAD samples overlap (seconds between segments)
 ```
 
 > [!WARNING]
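
With these options, VAD can be enabled at server startup, e.g. `whisper-server -m models/ggml-base.en.bin --vad -vm models/silero-vad-ggml.bin -vt 0.50` (the VAD model file name here is illustrative; point `--vad-model` at whatever GGML-format VAD model you use). The remaining `-v*` flags tune how detected speech is segmented and default to the values shown above.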
examples/server/server.cpp CHANGED
 
@@ -5,6 +5,7 @@
 #include "httplib.h"
 #include "json.hpp"
 
+#include <cfloat>
 #include <chrono>
 #include <cmath>
 #include <cstdio>
 
@@ -90,6 +91,16 @@ struct whisper_params {
     std::string openvino_encode_device = "CPU";
 
     std::string dtw = "";
+
+    // Voice Activity Detection (VAD) parameters
+    bool        vad                         = false;
+    std::string vad_model                   = "";
+    float       vad_threshold               = 0.5f;
+    int         vad_min_speech_duration_ms  = 250;
+    int         vad_min_silence_duration_ms = 100;
+    float       vad_max_speech_duration_s   = FLT_MAX;
+    int         vad_speech_pad_ms           = 30;
+    float       vad_samples_overlap         = 0.1f;
 };
 
 void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) {
 
@@ -140,6 +151,18 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -nc,       --no-context                    [%-7s] do not use previous audio context\n", params.no_context ? "true" : "false");
     fprintf(stderr, "  -ng,       --no-gpu                        [%-7s] do not use gpu\n", params.use_gpu ? "false" : "true");
     fprintf(stderr, "  -fa,       --flash-attn                    [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
+    // Voice Activity Detection (VAD) parameters
+    fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
+    fprintf(stderr, "             --vad                           [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
+    fprintf(stderr, "  -vm FNAME, --vad-model FNAME               [%-7s] VAD model path\n", params.vad_model.c_str());
+    fprintf(stderr, "  -vt N,     --vad-threshold N               [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold);
+    fprintf(stderr, "  -vspd N,   --vad-min-speech-duration-ms N  [%-7d] VAD min speech duration (in ms)\n", params.vad_min_speech_duration_ms);
+    fprintf(stderr, "  -vsd N,    --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms);
+    fprintf(stderr, "  -vmsd N,   --vad-max-speech-duration-s N   [%-7s] VAD max speech duration (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ?
+                                                                           std::string("FLT_MAX").c_str() :
+                                                                           std::to_string(params.vad_max_speech_duration_s).c_str());
+    fprintf(stderr, "  -vp N,     --vad-speech-pad-ms N           [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms);
+    fprintf(stderr, "  -vo N,     --vad-samples-overlap N         [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
     fprintf(stderr, "\n");
 }
 
@@ -195,6 +218,16 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
     else if (                  arg == "--request-path")                 { sparams.request_path   = argv[++i]; }
     else if (                  arg == "--inference-path")               { sparams.inference_path = argv[++i]; }
     else if (                  arg == "--convert")                      { sparams.ffmpeg_converter = true; }
+
+    // Voice Activity Detection (VAD)
+    else if (                  arg == "--vad")                          { params.vad = true; }
+    else if (arg == "-vm"   || arg == "--vad-model")                    { params.vad_model = argv[++i]; }
+    else if (arg == "-vt"   || arg == "--vad-threshold")                { params.vad_threshold = std::stof(argv[++i]); }
+    else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms")   { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
+    else if (arg == "-vsd"  || arg == "--vad-min-silence-duration-ms")  { params.vad_min_silence_duration_ms = std::stoi(argv[++i]); }
+    else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s")    { params.vad_max_speech_duration_s = std::stof(argv[++i]); }
+    else if (arg == "-vp"   || arg == "--vad-speech-pad-ms")            { params.vad_speech_pad_ms = std::stoi(argv[++i]); }
+    else if (arg == "-vo"   || arg == "--vad-samples-overlap")          { params.vad_samples_overlap = std::stof(argv[++i]); }
     else {
         fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
         whisper_print_usage(argc, argv, params, sparams);
 
@@ -511,6 +544,34 @@ void get_req_parameters(const Request & req, whisper_params & params)
     {
         params.no_context = parse_str_to_bool(req.get_file_value("no_context").content);
     }
+    if (req.has_file("vad"))
+    {
+        params.vad = parse_str_to_bool(req.get_file_value("vad").content);
+    }
+    if (req.has_file("vad_threshold"))
+    {
+        params.vad_threshold = std::stof(req.get_file_value("vad_threshold").content);
+    }
+    if (req.has_file("vad_min_speech_duration_ms"))
+    {
+        params.vad_min_speech_duration_ms = std::stoi(req.get_file_value("vad_min_speech_duration_ms").content);
+    }
+    if (req.has_file("vad_min_silence_duration_ms"))
+    {
+        params.vad_min_silence_duration_ms = std::stoi(req.get_file_value("vad_min_silence_duration_ms").content);
+    }
+    if (req.has_file("vad_max_speech_duration_s"))
+    {
+        params.vad_max_speech_duration_s = std::stof(req.get_file_value("vad_max_speech_duration_s").content);
+    }
+    if (req.has_file("vad_speech_pad_ms"))
+    {
+        params.vad_speech_pad_ms = std::stoi(req.get_file_value("vad_speech_pad_ms").content);
+    }
+    if (req.has_file("vad_samples_overlap"))
+    {
+        params.vad_samples_overlap = std::stof(req.get_file_value("vad_samples_overlap").content);
+    }
 }
 
 } // namespace
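
Since `get_req_parameters` reads these settings from multipart form fields, a client can toggle VAD per request; the VAD model itself must still be loaded at startup via `--vad-model`, as there is no `vad_model` request field. Below is a minimal client sketch using the same cpp-httplib the server already includes. It assumes the defaults documented above (127.0.0.1:8080, `/inference` path), and the audio file name is illustrative:

```cpp
// Client-side sketch: POST audio to whisper-server with per-request VAD
// settings. Host, port, and inference path are the server defaults.
#include "httplib.h"

#include <cstdio>
#include <fstream>
#include <sstream>

int main() {
    std::ifstream wav("jfk.wav", std::ios::binary); // illustrative file name
    std::stringstream audio;
    audio << wav.rdbuf();

    // field names match those read by get_req_parameters above
    httplib::MultipartFormDataItems items = {
        // { name, content, filename, content_type }
        { "file",                        audio.str(), "jfk.wav", "audio/wav" },
        { "vad",                         "true",      "",        ""          },
        { "vad_threshold",               "0.5",       "",        ""          },
        { "vad_min_silence_duration_ms", "100",       "",        ""          },
    };

    httplib::Client cli("127.0.0.1", 8080);
    if (auto res = cli.Post("/inference", items)) {
        printf("%s\n", res->body.c_str());
    }
    return 0;
}
```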
 
@@ -829,6 +890,16 @@ int main(int argc, char ** argv) {
 
     wparams.suppress_nst     = params.suppress_nst;
 
+    wparams.vad            = params.vad;
+    wparams.vad_model_path = params.vad_model.c_str();
+
+    wparams.vad_params.threshold               = params.vad_threshold;
+    wparams.vad_params.min_speech_duration_ms  = params.vad_min_speech_duration_ms;
+    wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
+    wparams.vad_params.max_speech_duration_s   = params.vad_max_speech_duration_s;
+    wparams.vad_params.speech_pad_ms           = params.vad_speech_pad_ms;
+    wparams.vad_params.samples_overlap         = params.vad_samples_overlap;
+
     whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
 
     // this callback is called on each new segment
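
The fields set above are the VAD members that `whisper_full_params` itself exposes, so the same wiring works outside the server. A minimal standalone sketch under the same assumptions (model paths are placeholders, and `pcmf32` is assumed to hold 16 kHz mono samples filled elsewhere):

```cpp
// Standalone sketch of the same VAD wiring against the whisper.cpp API.
// Model paths below are placeholders, not files shipped with the repo.
#include "whisper.h"

#include <cstdio>
#include <vector>

int main() {
    struct whisper_context * ctx = whisper_init_from_file_with_params(
        "ggml-base.en.bin", whisper_context_default_params());

    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    // same fields the server copies out of its whisper_params
    wparams.vad            = true;
    wparams.vad_model_path = "silero-vad-ggml.bin"; // placeholder path
    wparams.vad_params.threshold               = 0.5f;
    wparams.vad_params.min_silence_duration_ms = 100;

    std::vector<float> pcmf32; // 16 kHz mono PCM, filled elsewhere
    if (whisper_full(ctx, wparams, pcmf32.data(), (int) pcmf32.size()) == 0) {
        for (int i = 0; i < whisper_full_n_segments(ctx); ++i) {
            printf("%s\n", whisper_full_get_segment_text(ctx, i));
        }
    }

    whisper_free(ctx);
    return 0;
}
```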