eschmidbauer ggerganov committed on
Commit
dd44653
·
unverified ·
1 Parent(s): b0e83a9

server : add dtw (#2044)

Browse files

* server.cpp: add dtw

* Update examples/server/server.cpp

---------

Co-authored-by: Georgi Gerganov <[email protected]>

Files changed (1) hide show
  1. examples/server/server.cpp +48 -0
examples/server/server.cpp CHANGED
@@ -87,6 +87,8 @@ struct whisper_params {
87
  std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line
88
 
89
  std::string openvino_encode_device = "CPU";
 
 
90
  };
91
 
92
  void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) {
@@ -126,6 +128,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
126
  fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
127
  fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
128
  // server params
 
129
  fprintf(stderr, " --host HOST, [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str());
130
  fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port);
131
  fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str());
@@ -173,6 +176,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
173
  else if ( arg == "--prompt") { params.prompt = argv[++i]; }
174
  else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
175
  else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
 
176
  else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
177
  // server params
178
  else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); }
@@ -499,6 +503,49 @@ int main(int argc, char ** argv) {
499
  // whisper init
500
  struct whisper_context_params cparams = whisper_context_default_params();
501
  cparams.use_gpu = params.use_gpu;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
502
 
503
  struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
504
 
@@ -865,6 +912,7 @@ int main(int argc, char ** argv) {
865
  if (!params.no_timestamps) {
866
  word["start"] = token.t0 * 0.01;
867
  word["end"] = token.t1 * 0.01;
 
868
  }
869
  word["probability"] = token.p;
870
  total_logprob += token.plog;
 
87
  std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line
88
 
89
  std::string openvino_encode_device = "CPU";
90
+
91
+ std::string dtw = "";
92
  };
93
 
94
  void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) {
 
128
  fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
129
  fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
130
  // server params
131
+ fprintf(stderr, " -dtw MODEL --dtw MODEL [%-7s] compute token-level timestamps\n", params.dtw.c_str());
132
  fprintf(stderr, " --host HOST, [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str());
133
  fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port);
134
  fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str());
 
176
  else if ( arg == "--prompt") { params.prompt = argv[++i]; }
177
  else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
178
  else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
179
+ else if (arg == "-dtw" || arg == "--dtw") { params.dtw = argv[++i]; }
180
  else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
181
  // server params
182
  else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); }
 
503
  // whisper init
504
  struct whisper_context_params cparams = whisper_context_default_params();
505
  cparams.use_gpu = params.use_gpu;
506
+ if (!params.dtw.empty()) {
507
+ cparams.dtw_token_timestamps = true;
508
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_NONE;
509
+
510
+ if (params.dtw == "tiny") {
511
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_TINY;
512
+ }
513
+ if (params.dtw == "tiny.en") {
514
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_TINY_EN;
515
+ }
516
+ if (params.dtw == "base") {
517
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE;
518
+ }
519
+ if (params.dtw == "base.en") {
520
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE_EN;
521
+ }
522
+ if (params.dtw == "small") {
523
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_SMALL;
524
+ }
525
+ if (params.dtw == "small.en") {
526
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_SMALL_EN;
527
+ }
528
+ if (params.dtw == "medium") {
529
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_MEDIUM;
530
+ }
531
+ if (params.dtw == "medium.en") {
532
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_MEDIUM_EN;
533
+ }
534
+ if (params.dtw == "large.v1") {
535
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V1;
536
+ }
537
+ if (params.dtw == "large.v2") {
538
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V2;
539
+ }
540
+ if (params.dtw == "large.v3") {
541
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3;
542
+ }
543
+
544
+ if (cparams.dtw_aheads_preset == WHISPER_AHEADS_NONE) {
545
+ fprintf(stderr, "error: unknown DTW preset '%s'\n", params.dtw.c_str());
546
+ return 3;
547
+ }
548
+ }
549
 
550
  struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
551
 
 
912
  if (!params.no_timestamps) {
913
  word["start"] = token.t0 * 0.01;
914
  word["end"] = token.t1 * 0.01;
915
+ word["t_dtw"] = token.t_dtw;
916
  }
917
  word["probability"] = token.p;
918
  total_logprob += token.plog;