Spaces:
Running
Running
server : add dtw (#2044)
Browse files
* server.cpp: add dtw
* Update examples/server/server.cpp
---------
Co-authored-by: Georgi Gerganov <[email protected]>
- examples/server/server.cpp +48 -0
examples/server/server.cpp
CHANGED
|
@@ -87,6 +87,8 @@ struct whisper_params {
|
|
| 87 |
std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line
|
| 88 |
|
| 89 |
std::string openvino_encode_device = "CPU";
|
|
|
|
|
|
|
| 90 |
};
|
| 91 |
|
| 92 |
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) {
|
|
@@ -126,6 +128,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 126 |
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
| 127 |
fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
|
| 128 |
// server params
|
|
|
|
| 129 |
fprintf(stderr, " --host HOST, [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str());
|
| 130 |
fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port);
|
| 131 |
fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str());
|
|
@@ -173,6 +176,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
|
|
| 173 |
else if ( arg == "--prompt") { params.prompt = argv[++i]; }
|
| 174 |
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
| 175 |
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
|
|
|
|
| 176 |
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
|
| 177 |
// server params
|
| 178 |
else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); }
|
|
@@ -499,6 +503,49 @@ int main(int argc, char ** argv) {
|
|
| 499 |
// whisper init
|
| 500 |
struct whisper_context_params cparams = whisper_context_default_params();
|
| 501 |
cparams.use_gpu = params.use_gpu;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 502 |
|
| 503 |
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
|
| 504 |
|
|
@@ -865,6 +912,7 @@ int main(int argc, char ** argv) {
|
|
| 865 |
if (!params.no_timestamps) {
|
| 866 |
word["start"] = token.t0 * 0.01;
|
| 867 |
word["end"] = token.t1 * 0.01;
|
|
|
|
| 868 |
}
|
| 869 |
word["probability"] = token.p;
|
| 870 |
total_logprob += token.plog;
|
|
|
|
| 87 |
std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line
|
| 88 |
|
| 89 |
std::string openvino_encode_device = "CPU";
|
| 90 |
+
|
| 91 |
+
std::string dtw = "";
|
| 92 |
};
|
| 93 |
|
| 94 |
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) {
|
|
|
|
| 128 |
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
| 129 |
fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
|
| 130 |
// server params
|
| 131 |
+
fprintf(stderr, " -dtw MODEL --dtw MODEL [%-7s] compute token-level timestamps\n", params.dtw.c_str());
|
| 132 |
fprintf(stderr, " --host HOST, [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str());
|
| 133 |
fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port);
|
| 134 |
fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str());
|
|
|
|
| 176 |
else if ( arg == "--prompt") { params.prompt = argv[++i]; }
|
| 177 |
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
| 178 |
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
|
| 179 |
+
else if (arg == "-dtw" || arg == "--dtw") { params.dtw = argv[++i]; }
|
| 180 |
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
|
| 181 |
// server params
|
| 182 |
else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); }
|
|
|
|
| 503 |
// whisper init
|
| 504 |
struct whisper_context_params cparams = whisper_context_default_params();
|
| 505 |
cparams.use_gpu = params.use_gpu;
|
| 506 |
+
if (!params.dtw.empty()) {
|
| 507 |
+
cparams.dtw_token_timestamps = true;
|
| 508 |
+
cparams.dtw_aheads_preset = WHISPER_AHEADS_NONE;
|
| 509 |
+
|
| 510 |
+
if (params.dtw == "tiny") {
|
| 511 |
+
cparams.dtw_aheads_preset = WHISPER_AHEADS_TINY;
|
| 512 |
+
}
|
| 513 |
+
if (params.dtw == "tiny.en") {
|
| 514 |
+
cparams.dtw_aheads_preset = WHISPER_AHEADS_TINY_EN;
|
| 515 |
+
}
|
| 516 |
+
if (params.dtw == "base") {
|
| 517 |
+
cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE;
|
| 518 |
+
}
|
| 519 |
+
if (params.dtw == "base.en") {
|
| 520 |
+
cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE_EN;
|
| 521 |
+
}
|
| 522 |
+
if (params.dtw == "small") {
|
| 523 |
+
cparams.dtw_aheads_preset = WHISPER_AHEADS_SMALL;
|
| 524 |
+
}
|
| 525 |
+
if (params.dtw == "small.en") {
|
| 526 |
+
cparams.dtw_aheads_preset = WHISPER_AHEADS_SMALL_EN;
|
| 527 |
+
}
|
| 528 |
+
if (params.dtw == "medium") {
|
| 529 |
+
cparams.dtw_aheads_preset = WHISPER_AHEADS_MEDIUM;
|
| 530 |
+
}
|
| 531 |
+
if (params.dtw == "medium.en") {
|
| 532 |
+
cparams.dtw_aheads_preset = WHISPER_AHEADS_MEDIUM_EN;
|
| 533 |
+
}
|
| 534 |
+
if (params.dtw == "large.v1") {
|
| 535 |
+
cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V1;
|
| 536 |
+
}
|
| 537 |
+
if (params.dtw == "large.v2") {
|
| 538 |
+
cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V2;
|
| 539 |
+
}
|
| 540 |
+
if (params.dtw == "large.v3") {
|
| 541 |
+
cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3;
|
| 542 |
+
}
|
| 543 |
+
|
| 544 |
+
if (cparams.dtw_aheads_preset == WHISPER_AHEADS_NONE) {
|
| 545 |
+
fprintf(stderr, "error: unknown DTW preset '%s'\n", params.dtw.c_str());
|
| 546 |
+
return 3;
|
| 547 |
+
}
|
| 548 |
+
}
|
| 549 |
|
| 550 |
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
|
| 551 |
|
|
|
|
| 912 |
if (!params.no_timestamps) {
|
| 913 |
word["start"] = token.t0 * 0.01;
|
| 914 |
word["end"] = token.t1 * 0.01;
|
| 915 |
+
word["t_dtw"] = token.t_dtw;
|
| 916 |
}
|
| 917 |
word["probability"] = token.p;
|
| 918 |
total_logprob += token.plog;
|